# gisp-pp719v2-scraper/app/main.py

import httpx
import re
import os
import time
import asyncio
from fastapi import FastAPI, HTTPException
from typing import List, Optional, Dict
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# FastAPI application object; the route handlers below register on it.
app = FastAPI(title="GISP Scraper API (Selenium Mode)")

# Configuration
# Endpoint that fetch_gisp_data POSTs its registry-number filter to.
API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/"
# WebDriver endpoint of the Selenium hub used for remote browser sessions.
SELENIUM_HUB_URL = "http://selenium-hub:4444/wd/hub"

async def fetch_gisp_data(registry_number: str) -> Optional[List[dict]]:
    """Look up registry entries whose number contains *registry_number*.

    POSTs a ``contains`` filter on ``product_reg_number_2023`` to ``API_URL``
    and returns the ``items`` list of the JSON response.

    Args:
        registry_number: Registry number (or fragment of one) to search for.

    Returns:
        The non-empty list of matching items, or ``None`` when nothing
        matched, the request failed, or the response body was unusable.
        Failures are printed rather than raised, so a caller cannot tell
        "no match" apart from "upstream error".
    """
    payload = {
        "opt": {
            "filter": ["product_reg_number_2023", "contains", registry_number]
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(API_URL, json=payload, timeout=30.0)
            response.raise_for_status()
            data = response.json()
        except (httpx.HTTPError, ValueError) as e:
            # httpx.HTTPError covers transport failures and non-2xx statuses;
            # ValueError covers a body that is not valid JSON. Anything else
            # is a programming error and is allowed to propagate.
            print(f"API scraping error: {e}")
            return None

    # A JSON body that is not an object cannot carry an "items" list.
    if not isinstance(data, dict):
        print(f"API scraping error: unexpected response type {type(data).__name__}")
        return None

    items = data.get("items")
    if isinstance(items, list) and items:
        return items
    return None

def get_driver() -> "webdriver.Remote":
    """Open a headless Chrome session on the Selenium hub.

    Returns:
        A remote WebDriver connected to ``SELENIUM_HUB_URL``. The caller owns
        the session and is responsible for calling ``quit()`` on it.
    """
    chrome_options = Options()
    chrome_flags = (
        "--remote-debugging-port=9222",
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    )
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    chrome_options.set_capability("browserName", "chrome")
    chrome_options.set_capability("platformName", "linux")

    # Use the module-level constant so the hub address is defined in one place.
    return webdriver.Remote(
        command_executor=SELENIUM_HUB_URL,
        options=chrome_options,
    )

def _parse_characteristics(html: str) -> Dict[str, str]:
    """Extract name/value pairs from ``product-characteristic`` rows in *html*."""
    characteristics = {}
    soup = BeautifulSoup(html, 'html.parser')
    for row in soup.find_all(class_="product-characteristic"):
        name_el = row.find(class_="product-characteristic__name")
        value_el = row.find(class_="product-characteristic__value")
        if name_el is None or value_el is None:
            continue
        label = name_el.get_text(strip=True)
        if label:
            # A repeated label overwrites the earlier value.
            characteristics[label] = value_el.get_text(strip=True)
    return characteristics


def scrape_characteristics(url: str) -> Dict[str, str]:
    """Load *url* in a remote browser and scrape its characteristics table.

    Opens the page, clicks the second ``ant-tabs-tab`` element when one
    appears, waits for ``product-characteristic`` rows and parses them out of
    the rendered page source.

    Args:
        url: Address of the product page to scrape.

    Returns:
        A mapping of characteristic name to value (empty when no rows were
        found), or ``{"error": <message>}`` when the browser session failed.
        Errors are never raised to the caller.
    """
    # Imported here so the block does not rely on a new file-level import.
    from selenium.common.exceptions import TimeoutException

    driver = None
    try:
        driver = get_driver()
        driver.get(url)

        wait = WebDriverWait(driver, 15)

        def _second_tab(drv):
            # Truthy only once at least two tabs have been rendered.
            tabs = drv.find_elements(By.CLASS_NAME, "ant-tabs-tab")
            return tabs[1] if len(tabs) >= 2 else False

        # The tabs are looked up by polling: querying them right after
        # driver.get() can run before the page has rendered them.
        try:
            second_tab = wait.until(_second_tab)
        except TimeoutException:
            second_tab = None
            print("Timed out waiting for tabs; scraping the page as loaded")

        if second_tab is not None:
            # Use execute_script for a more reliable click on dynamic elements
            driver.execute_script("arguments[0].click();", second_tab)
            # Fixed pause kept from the original: the presence check below may
            # be satisfied by rows that were already in the DOM before the
            # click, so it cannot replace this delay on its own.
            time.sleep(5)

        # Wait for product-characteristic elements; a timeout is not fatal,
        # the page is parsed as-is and simply yields no rows.
        try:
            wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "product-characteristic")))
        except TimeoutException:
            print("Timed out waiting for characteristics elements")

        return _parse_characteristics(driver.page_source)
    except Exception as e:
        # Boundary of the scraping step: report the failure in-band so the
        # API handler can still return the registry data.
        print(f"Selenium error: {e}")
        return {"error": str(e)}
    finally:
        if driver:
            driver.quit()

@app.get("/details/{registry_number:path}")
async def get_product_details(registry_number: str):
    """Return registry data and scraped characteristics for one product.

    Looks the number up through ``fetch_gisp_data``, takes the first hit and,
    when it carries a ``product_gisp_url``, scrapes that page for
    characteristics.

    Raises:
        HTTPException: 400 when the registry number is empty after trimming,
            404 when the lookup yields no items.
    """
    registry_number = registry_number.strip()
    if not registry_number:
        # An empty value would be sent as an empty "contains" filter and the
        # first item of whatever came back would be reported as the product.
        raise HTTPException(status_code=400, detail="Registry number must not be empty")

    results = await fetch_gisp_data(registry_number)
    if not results:
        raise HTTPException(status_code=404, detail=f"Product with registry number {registry_number} not found")

    target_item = results[0]
    product_gisp_url = target_item.get("product_gisp_url", "")

    if not product_gisp_url:
        return {
            "registry_number": registry_number,
            "basic_info": target_item,
            "characteristics": {"error": "No product catalog URL found"}
        }

    # Run Selenium in a thread pool to avoid blocking the event loop.
    # get_running_loop() is the supported way to get the loop in a coroutine.
    loop = asyncio.get_running_loop()
    characteristics = await loop.run_in_executor(None, scrape_characteristics, product_gisp_url)

    return {
        "registry_number": registry_number,
        "product_name": target_item.get("product_name"),
        "manufacturer": {
            "name": target_item.get("org_name"),
            "inn": target_item.get("org_inn")
        },
        "technical_info": {
            "okpd2": target_item.get("product_okpd2"),
            "tnved": target_item.get("product_tnved")
        },
        "characteristics": characteristics,
        "raw_data": target_item
    }

@app.get("/health")
def health():
    # Liveness probe: returns a constant payload and touches no dependency.
    status_payload = {"status": "ok"}
    return status_payload