Add docker-compose and update Selenium scraping logic with target selectors

This commit is contained in:
Flash
2026-04-10 19:56:19 +00:00
parent 75f51121ea
commit 1b62fe732c
5 changed files with 211 additions and 81 deletions

View File

@@ -1,23 +1,26 @@
import httpx
import re
import os
import time
import asyncio
from fastapi import FastAPI, HTTPException
from typing import List, Optional, Dict
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# FastAPI application: Selenium-backed scraper for the GISP public registry.
# (Removed a dead duplicate FastAPI() assignment left over from the diff —
# only the "Selenium Mode" instance was ever used.)
app = FastAPI(title="GISP Scraper API (Selenium Mode)")

# Configuration
# The API endpoint identified from network inspection of the GISP site.
API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/"
# Remote Selenium Grid hub; hostname resolves on the docker-compose network.
SELENIUM_HUB_URL = "http://selenium-hub:4444/wd/hub"
async def fetch_gisp_data(registry_number: str):
# Constructing the filter payload based on what we saw in the Network tab
# We will remove the restrictive date filters (res_valid_till, etc.)
payload = {
"opt": {
"sort": None,
"requireTotalCount": True,
"searchOperation": "contains",
"searchValue": None,
"skip": 0,
"take": 10,
"userData": {},
"filter": ["product_reg_number_2023", "contains", registry_number]
}
}
@@ -28,24 +31,115 @@ async def fetch_gisp_data(registry_number: str):
response.raise_for_status()
data = response.json()
# GISP usually returns { "data": [ ... ], "totalCount": N }
if "data" in data and len(data["data"]) > 0:
# Return the URL or specific entry found
# Based on the DevExtreme schema, we might need a specific ID to form the URL
return data["data"]
if "items" in data and len(data["items"]) > 0:
return data["items"]
return None
except Exception as e:
print(f"API scraping error: {e}")
return None
@app.get("/scrape/{registry_number:path}")
async def get_product_link(registry_number: str):
def get_driver():
    """Create a remote headless Chrome session on the Selenium Grid hub.

    Returns:
        selenium.webdriver.Remote: a connected WebDriver instance.
        The caller is responsible for calling ``driver.quit()``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--remote-debugging-port=9222")
    chrome_options.add_argument("--headless=new")
    # Required inside containers: no sandbox, and /dev/shm is usually too
    # small for Chrome, so spill to /tmp instead.
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    # Real-browser user agent to avoid trivial bot detection.
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    chrome_options.set_capability("browserName", "chrome")
    chrome_options.set_capability("platformName", "linux")
    # Use the module-level constant instead of duplicating the hub URL here,
    # so the endpoint is configured in exactly one place.
    driver = webdriver.Remote(
        command_executor=SELENIUM_HUB_URL,
        options=chrome_options
    )
    return driver
def scrape_characteristics(url: str) -> Dict:
    """Open *url* in a remote browser and harvest the characteristics tab.

    Clicks the second ``ant-tabs-tab`` element (presumably the product
    characteristics tab — confirm against the live page), waits for
    ``product-characteristic`` rows to render, then parses name/value pairs
    out of the page source with BeautifulSoup.

    Args:
        url: Product catalog page URL on gisp.gov.ru.

    Returns:
        Dict mapping characteristic name -> value; ``{"error": ...}`` when
        scraping fails entirely. Best-effort: a timeout while waiting for
        rows still attempts to parse whatever has rendered.
    """
    driver = None
    try:
        driver = get_driver()
        driver.get(url)
        # Single explicit wait; a redundant second assignment from the diff
        # was dropped — only this 15s wait was ever used.
        wait = WebDriverWait(driver, 15)
        # Tabs render dynamically; collect all of them by class.
        tabs = driver.find_elements(By.CLASS_NAME, "ant-tabs-tab")
        if len(tabs) >= 2:
            # JS click is more reliable than .click() on dynamic elements.
            driver.execute_script("arguments[0].click();", tabs[1])
            time.sleep(5)
            # Wait for the characteristic rows to appear after the tab switch.
            try:
                wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "product-characteristic")))
            except Exception:
                # Narrowed from a bare except (which would also trap
                # KeyboardInterrupt/SystemExit); still best-effort.
                print("Timed out waiting for characteristics elements")
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        characteristics = {}
        # Strategy: find all rows that look like key-value pairs.
        rows = soup.find_all(class_="product-characteristic")
        for row in rows:
            name_el = row.find(class_="product-characteristic__name")
            value_el = row.find(class_="product-characteristic__value")
            if name_el and value_el:
                l_text = name_el.get_text(strip=True)
                v_text = value_el.get_text(strip=True)
                if l_text:
                    characteristics[l_text] = v_text
        return characteristics
    except Exception as e:
        print(f"Selenium error: {e}")
        return {"error": str(e)}
    finally:
        # Always release the remote browser session, even on failure.
        if driver:
            driver.quit()
@app.get("/details/{registry_number:path}")
async def get_product_details(registry_number: str):
    """Look up a registry number via the GISP API and enrich the result
    with characteristics scraped from the product's catalog page.

    Args:
        registry_number: GISP product registry number (path may contain
            slashes, hence the ``:path`` converter).

    Raises:
        HTTPException: 404 when the registry number yields no results.
    """
    registry_number = registry_number.strip()
    results = await fetch_gisp_data(registry_number)
    if not results:
        # Single raise — the diff had left an older duplicate raise and an
        # unconditional early return that made the rest of this function
        # unreachable; both removed.
        raise HTTPException(status_code=404, detail=f"Product with registry number {registry_number} not found")
    target_item = results[0]
    product_gisp_url = target_item.get("product_gisp_url", "")
    if not product_gisp_url:
        # No catalog page to scrape — return the API data with a marker.
        return {
            "registry_number": registry_number,
            "basic_info": target_item,
            "characteristics": {"error": "No product catalog URL found"}
        }
    # Run blocking Selenium work in a thread pool so the event loop stays
    # responsive. get_running_loop() replaces the deprecated
    # get_event_loop() call inside a coroutine.
    loop = asyncio.get_running_loop()
    characteristics = await loop.run_in_executor(None, scrape_characteristics, product_gisp_url)
    return {
        "registry_number": registry_number,
        "product_name": target_item.get("product_name"),
        "manufacturer": {
            "name": target_item.get("org_name"),
            "inn": target_item.get("org_inn")
        },
        "technical_info": {
            "okpd2": target_item.get("product_okpd2"),
            "tnved": target_item.get("product_tnved")
        },
        "characteristics": characteristics,
        "raw_data": target_item
    }
@app.get("/health")
def health():