"""FastAPI service exposing GISP product details.

A product is looked up by registry number through the GISP HTTP API, then its
characteristics are scraped from the product page using a remote Selenium
browser.
"""

import asyncio
import logging
import os
import re
import time
from typing import Dict, List, Optional

import httpx
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

logger = logging.getLogger(__name__)

app = FastAPI(title="GISP Scraper API (Selenium Mode)")

# Configuration
API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/"
# The hub address can be overridden through the environment; the default is
# the address that used to be hard-coded in get_driver().
SELENIUM_HUB_URL = os.getenv("SELENIUM_HUB_URL", "http://selenium-hub:4444/wd/hub")

API_TIMEOUT_SECONDS = 30.0
# Maximum time to wait for the page tabs to be rendered after navigation.
TABS_TIMEOUT_SECONDS = 20
# Maximum time to wait for the characteristic rows to be present.
CHARACTERISTICS_TIMEOUT_SECONDS = 15
# Fixed pause after clicking the tab, kept from the original implementation.
TAB_SETTLE_SECONDS = 5

TAB_CLASS = "ant-tabs-tab"
CHARACTERISTIC_CLASS = "product-characteristic"


async def fetch_gisp_data(registry_number: str) -> Optional[List[Dict]]:
    """Query the GISP API for items whose registry number contains the value.

    Args:
        registry_number: Text matched with a ``contains`` filter against the
            ``product_reg_number_2023`` field.

    Returns:
        The non-empty ``items`` list from the JSON response, or ``None`` when
        the request fails, the body is not valid JSON, or no items are
        returned.
    """
    payload = {
        "opt": {
            "filter": ["product_reg_number_2023", "contains", registry_number]
        }
    }
    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                API_URL, json=payload, timeout=API_TIMEOUT_SECONDS
            )
            response.raise_for_status()
            data = response.json()
        except (httpx.HTTPError, ValueError) as exc:
            # httpx.HTTPError covers transport and status errors; ValueError
            # covers a body that cannot be decoded as JSON.
            logger.warning("API scraping error: %s", exc)
            return None

    items = data.get("items") if isinstance(data, dict) else None
    if isinstance(items, list) and items:
        return items
    return None


def get_driver():
    """Create a headless Chrome session on the Selenium hub.

    Returns:
        A ``webdriver.Remote`` instance connected to ``SELENIUM_HUB_URL``.
        The caller is responsible for calling ``quit()`` on it.
    """
    chrome_options = Options()
    chrome_options.add_argument("--remote-debugging-port=9222")
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    chrome_options.set_capability("browserName", "chrome")
    chrome_options.set_capability("platformName", "linux")

    return webdriver.Remote(
        command_executor=SELENIUM_HUB_URL,
        options=chrome_options,
    )


def _open_second_tab(driver) -> None:
    """Wait for the page tabs and click the second one, if it appears."""

    def _at_least_two_tabs(drv):
        found = drv.find_elements(By.CLASS_NAME, TAB_CLASS)
        return found if len(found) >= 2 else False

    try:
        # The previous code looked the tabs up immediately after navigation
        # without waiting, so tabs rendered later were missed.
        tabs = WebDriverWait(driver, TABS_TIMEOUT_SECONDS).until(_at_least_two_tabs)
    except TimeoutException:
        logger.warning("Fewer than two tabs found; continuing without a tab click")
        return

    # Use execute_script for a more reliable click on dynamic elements.
    driver.execute_script("arguments[0].click();", tabs[1])
    time.sleep(TAB_SETTLE_SECONDS)


def _parse_characteristics(html: str) -> Dict[str, str]:
    """Extract name/value pairs from the characteristic rows in the HTML."""
    soup = BeautifulSoup(html, "html.parser")
    characteristics = {}
    for row in soup.find_all(class_=CHARACTERISTIC_CLASS):
        name_el = row.find(class_="product-characteristic__name")
        value_el = row.find(class_="product-characteristic__value")
        if name_el is None or value_el is None:
            continue
        name = name_el.get_text(strip=True)
        if name:
            characteristics[name] = value_el.get_text(strip=True)
    return characteristics


def scrape_characteristics(url: str) -> Dict:
    """Scrape product characteristics from a product page with Selenium.

    This function blocks (browser navigation, explicit waits and a fixed
    pause), so it should not be called directly on the event loop.

    Args:
        url: Address of the product page to open.

    Returns:
        A dict mapping characteristic names to values. It is empty when no
        rows are found. On any failure a dict of the form
        ``{"error": "<message>"}`` is returned instead of raising.
    """
    driver = None
    try:
        driver = get_driver()
        driver.get(url)

        _open_second_tab(driver)

        try:
            WebDriverWait(driver, CHARACTERISTICS_TIMEOUT_SECONDS).until(
                EC.presence_of_all_elements_located(
                    (By.CLASS_NAME, CHARACTERISTIC_CLASS)
                )
            )
        except TimeoutException:
            # Best effort: parse whatever the page contains at this point.
            logger.warning("Timed out waiting for characteristics elements")

        return _parse_characteristics(driver.page_source)
    except Exception as exc:
        # Boundary handler: callers receive the error inside the result dict.
        logger.exception("Selenium error: %s", exc)
        return {"error": str(exc)}
    finally:
        if driver is not None:
            try:
                driver.quit()
            except WebDriverException as exc:
                # A failed teardown must not replace the value being returned.
                logger.warning("Failed to quit Selenium driver: %s", exc)


@app.get("/details/{registry_number:path}")
async def get_product_details(registry_number: str):
    """Return API data and scraped characteristics for a registry number.

    Args:
        registry_number: Registry number taken from the URL path; surrounding
            whitespace is stripped.

    Returns:
        A dict with the registry number, fields of the first matching API
        item, and the scraped characteristics. When the item has no
        ``product_gisp_url`` the item is returned under ``basic_info`` with
        an error entry in ``characteristics``.

    Raises:
        HTTPException: 400 when the registry number is blank, 404 when the
            API lookup yields no items.
    """
    registry_number = registry_number.strip()
    if not registry_number:
        # An empty "contains" filter would be sent to the API and the first
        # item of whatever it returned would be reported as the match.
        raise HTTPException(
            status_code=400, detail="Registry number must not be empty"
        )

    results = await fetch_gisp_data(registry_number)
    if not results:
        raise HTTPException(
            status_code=404,
            detail=f"Product with registry number {registry_number} not found",
        )

    target_item = results[0]
    product_gisp_url = target_item.get("product_gisp_url", "")

    if not product_gisp_url:
        return {
            "registry_number": registry_number,
            "basic_info": target_item,
            "characteristics": {"error": "No product catalog URL found"},
        }

    # Run Selenium in a thread pool to avoid blocking the event loop.
    loop = asyncio.get_running_loop()
    characteristics = await loop.run_in_executor(
        None, scrape_characteristics, product_gisp_url
    )

    return {
        "registry_number": registry_number,
        "product_name": target_item.get("product_name"),
        "manufacturer": {
            "name": target_item.get("org_name"),
            "inn": target_item.get("org_inn"),
        },
        "technical_info": {
            "okpd2": target_item.get("product_okpd2"),
            "tnved": target_item.get("product_tnved"),
        },
        "characteristics": characteristics,
        "raw_data": target_item,
    }


@app.get("/health")
def health():
    """Liveness endpoint; always reports ``{"status": "ok"}``."""
    return {"status": "ok"}