Add docker-compose and update Selenium scraping logic with target selectors

This commit is contained in:
Flash
2026-04-10 19:56:19 +00:00
parent 75f51121ea
commit 1b62fe732c
5 changed files with 211 additions and 81 deletions

View File

@@ -1,23 +1,26 @@
import asyncio
import os
import re
import time
from typing import Dict, List, Optional

import httpx
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
app = FastAPI(title="GISP Scraper API (API-Direct)")
app = FastAPI(title="GISP Scraper API (Selenium Mode)")
# The API endpoint identified from network inspection
# Configuration
API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/"
SELENIUM_HUB_URL = "http://selenium-hub:4444/wd/hub"
async def fetch_gisp_data(registry_number: str):
# Constructing the filter payload based on what we saw in the Network tab
# We will remove the restrictive date filters (res_valid_till, etc.)
payload = {
"opt": {
"sort": None,
"requireTotalCount": True,
"searchOperation": "contains",
"searchValue": None,
"skip": 0,
"take": 10,
"userData": {},
"filter": ["product_reg_number_2023", "contains", registry_number]
}
}
@@ -28,24 +31,115 @@ async def fetch_gisp_data(registry_number: str):
response.raise_for_status()
data = response.json()
# GISP usually returns { "data": [ ... ], "totalCount": N }
if "data" in data and len(data["data"]) > 0:
# Return the URL or specific entry found
# Based on the DevExtreme schema, we might need a specific ID to form the URL
return data["data"]
if "items" in data and len(data["items"]) > 0:
return data["items"]
return None
except Exception as e:
print(f"API scraping error: {e}")
return None
@app.get("/scrape/{registry_number:path}")
async def get_product_link(registry_number: str):
def get_driver():
    """Create a headless Chrome session on the remote Selenium Grid hub.

    Returns:
        selenium.webdriver.Remote: an open WebDriver session; the caller
        is responsible for calling ``driver.quit()``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--remote-debugging-port=9222")
    # "new" headless mode renders much closer to a real browser than legacy.
    chrome_options.add_argument("--headless=new")
    # Required when Chrome runs as root inside a container.
    chrome_options.add_argument("--no-sandbox")
    # Avoid renderer crashes caused by the tiny default /dev/shm in containers.
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    # Present a regular desktop UA so the site serves the normal page.
    chrome_options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    chrome_options.set_capability("browserName", "chrome")
    chrome_options.set_capability("platformName", "linux")
    # Fix: use the module-level SELENIUM_HUB_URL constant instead of a
    # duplicated hard-coded hub address (same value, single source of truth).
    driver = webdriver.Remote(
        command_executor=SELENIUM_HUB_URL,
        options=chrome_options,
    )
    return driver
def scrape_characteristics(url: str) -> Dict:
    """Open a GISP product page and extract its characteristics table.

    Clicks the second Ant Design tab (where the characteristics live),
    waits for the key/value rows to render, then parses them with
    BeautifulSoup.

    Args:
        url: Absolute URL of the product page in the GISP catalog.

    Returns:
        Mapping of characteristic name -> value; ``{"error": ...}`` when
        the browser session fails.
    """
    driver = None
    try:
        driver = get_driver()
        driver.get(url)
        # Fix: a single wait object — the previous code assigned a 20s wait
        # and then immediately shadowed it with a 15s one.
        wait = WebDriverWait(driver, 15)
        # The characteristics live on the second tab of the tab bar.
        tabs = driver.find_elements(By.CLASS_NAME, "ant-tabs-tab")
        if len(tabs) >= 2:
            # JS click is more reliable than .click() on dynamically
            # re-rendered elements.
            driver.execute_script("arguments[0].click();", tabs[1])
            time.sleep(5)  # allow the tab's async content request to finish
        try:
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CLASS_NAME, "product-characteristic")
                )
            )
        except TimeoutException:
            # Fix: catch the specific timeout instead of a bare except that
            # would also swallow KeyboardInterrupt/SystemExit. Best-effort:
            # we still parse whatever is in the DOM.
            print("Timed out waiting for characteristics elements")
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        characteristics = {}
        # Each row is a key/value pair identified by the BEM class names.
        for row in soup.find_all(class_="product-characteristic"):
            name_el = row.find(class_="product-characteristic__name")
            value_el = row.find(class_="product-characteristic__value")
            if name_el and value_el:
                l_text = name_el.get_text(strip=True)
                if l_text:
                    characteristics[l_text] = value_el.get_text(strip=True)
        return characteristics
    except Exception as e:
        print(f"Selenium error: {e}")
        return {"error": str(e)}
    finally:
        # Always release the remote browser session, even on failure.
        if driver:
            driver.quit()
@app.get("/details/{registry_number:path}")
async def get_product_details(registry_number: str):
    """Look up a product by GISP registry number and scrape its details.

    First queries the GISP JSON API for the registry entry, then (when the
    entry links to a catalog page) scrapes the characteristics table with
    Selenium.

    Raises:
        HTTPException: 404 when the registry number is unknown or the
        upstream API call failed.
    """
    registry_number = registry_number.strip()
    results = await fetch_gisp_data(registry_number)
    if not results:
        raise HTTPException(status_code=404, detail=f"Product with registry number {registry_number} not found")
    # The API may return several rows; treat the first as the best match.
    target_item = results[0]
    product_gisp_url = target_item.get("product_gisp_url", "")
    if not product_gisp_url:
        # Without a catalog URL there is nothing to scrape; still return
        # the basic API data instead of failing outright.
        return {
            "registry_number": registry_number,
            "basic_info": target_item,
            "characteristics": {"error": "No product catalog URL found"}
        }
    # Selenium is blocking; run it in the default executor so the event
    # loop stays responsive. Fix: get_running_loop() is the non-deprecated
    # way to obtain the loop from inside a coroutine.
    loop = asyncio.get_running_loop()
    characteristics = await loop.run_in_executor(None, scrape_characteristics, product_gisp_url)
    return {
        "registry_number": registry_number,
        "product_name": target_item.get("product_name"),
        "manufacturer": {
            "name": target_item.get("org_name"),
            "inn": target_item.get("org_inn")
        },
        "technical_info": {
            "okpd2": target_item.get("product_okpd2"),
            "tnved": target_item.get("product_tnved")
        },
        "characteristics": characteristics,
        "raw_data": target_item
    }
@app.get("/health")
def health():

29
docker-compose.yml Normal file
View File

@@ -0,0 +1,29 @@
version: '3.8'

services:
  # FastAPI scraper service, built from the local Dockerfile.
  gisp-scraper:
    build: .
    ports:
      - "8000:8000"
    environment:
      # Fix: point at the grid's WebDriver endpoint — the application and
      # the k8s manifest both use the /wd/hub path.
      - SELENIUM_HUB_URL=http://selenium-hub:4444/wd/hub
    depends_on:
      - selenium-hub
      - selenium-node-chrome

  # Selenium Grid hub: browser nodes register here and WebDriver
  # sessions are routed through it.
  selenium-hub:
    image: selenium/hub:4.16.1
    ports:
      - "4444:4444"

  # Chrome browser node that registers with the hub over the event bus.
  selenium-node-chrome:
    image: selenium/node-chrome:4.16.1
    environment:
      - SE_EVENT_BUS_HOST=selenium-hub
      - SE_EVENT_BUS_PUBLISH_PORT=4442
      - SE_EVENT_BUS_SUBSCRIBE_PORT=4443
      - SHM_SIZE=2g
    volumes:
      # Chrome needs a large shared-memory segment to avoid tab crashes.
      - /dev/shm:/dev/shm
    depends_on:
      - selenium-hub

18
ingress.yaml Normal file
View File

@@ -0,0 +1,18 @@
# Ingress routing external HTTP traffic to the gisp-scraper Service.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gisp-scraper-ingress
  annotations:
    # Expose only on Traefik's plain-HTTP entrypoint.
    traefik.ingress.kubernetes.io/router.entrypoints: web
spec:
  rules:
    - host: gisp-scraper.danilkolesnikov.ru
      http:
        paths:
          # Route everything under / to the scraper API on port 8000.
          - path: /
            pathType: Prefix
            backend:
              service:
                name: gisp-scraper
                port:
                  number: 8000

107
k8s.yaml
View File

@@ -1,5 +1,44 @@
# Deployment for the FastAPI scraper; talks to the Selenium hub via
# the in-cluster service DNS name.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gisp-scraper
  labels:
    app: gisp-scraper
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gisp-scraper
  template:
    metadata:
      labels:
        app: gisp-scraper
    spec:
      containers:
        - name: gisp-scraper
          image: git.danilkolesnikov.ru/flash/gisp-scraper:latest
          env:
            # WebDriver endpoint of the Selenium Grid hub service.
            - name: SELENIUM_HUB_URL
              value: "http://selenium-hub:4444/wd/hub"
          ports:
            - containerPort: 8000
---
# NodePort service exposing the scraper API on every node at :30001.
apiVersion: v1
kind: Service
metadata:
  name: gisp-scraper
spec:
  selector:
    app: gisp-scraper
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
      nodePort: 30001
  type: NodePort
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: selenium-hub
spec:
@@ -12,13 +51,9 @@ spec:
labels:
app: selenium-hub
spec:
tolerations:
- key: "node.kubernetes.io/disk-pressure"
operator: "Exists"
effect: "NoSchedule"
containers:
- name: selenium-hub
image: selenium/hub:latest
image: selenium/hub:4.16.1
ports:
- containerPort: 4444
---
@@ -30,15 +65,9 @@ spec:
selector:
app: selenium-hub
ports:
- port: 4444
name: http
targetPort: 4444
- port: 4442
name: publish
targetPort: 4442
- port: 4443
name: subscribe
targetPort: 4443
- protocol: TCP
port: 4444
targetPort: 4444
---
apiVersion: apps/v1
kind: Deployment
@@ -54,20 +83,18 @@ spec:
labels:
app: selenium-node-chrome
spec:
tolerations:
- key: "node.kubernetes.io/disk-pressure"
operator: "Exists"
effect: "NoSchedule"
containers:
- name: selenium-node-chrome
image: selenium/node-chrome:latest
image: selenium/node-chrome:4.16.1
env:
- name: SE_EVENT_BUS_HOST
value: "selenium-hub"
value: selenium-hub
- name: SE_EVENT_BUS_PUBLISH_PORT
value: "4442"
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
value: "4443"
- name: SHM_SIZE
value: "2g"
volumeMounts:
- name: dshm
mountPath: /dev/shm
@@ -75,43 +102,3 @@ spec:
- name: dshm
emptyDir:
medium: Memory
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: gisp-scraper
spec:
replicas: 1
selector:
matchLabels:
app: gisp-scraper
template:
metadata:
labels:
app: gisp-scraper
spec:
tolerations:
- key: "node.kubernetes.io/disk-pressure"
operator: "Exists"
effect: "NoSchedule"
containers:
- name: gisp-scraper
image: git.danilkolesnikov.ru/flash/gisp-scraper:latest
ports:
- containerPort: 8000
env:
- name: SELENIUM_HUB_URL
value: "http://selenium-hub:4444/wd/hub"
---
apiVersion: v1
kind: Service
metadata:
name: gisp-scraper
spec:
selector:
app: gisp-scraper
ports:
- protocol: TCP
port: 80
targetPort: 8000
type: ClusterIP

View File

@@ -1,3 +1,5 @@
# Web framework and ASGI server
fastapi
uvicorn
# Async HTTP client for the GISP JSON API
httpx
# Browser automation against the Selenium Grid
selenium
# HTML parsing of the scraped product pages
beautifulsoup4