diff --git a/app/main.py b/app/main.py index d2849cf..739385a 100644 --- a/app/main.py +++ b/app/main.py @@ -1,23 +1,26 @@ import httpx +import re +import os +import time +import asyncio from fastapi import FastAPI, HTTPException +from typing import List, Optional, Dict +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.chrome.options import Options +from bs4 import BeautifulSoup -app = FastAPI(title="GISP Scraper API (API-Direct)") +app = FastAPI(title="GISP Scraper API (Selenium Mode)") -# The API endpoint identified from network inspection +# Configuration API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/" +SELENIUM_HUB_URL = "http://selenium-hub:4444/wd/hub" async def fetch_gisp_data(registry_number: str): - # Constructing the filter payload based on what we saw in the Network tab - # We will remove the restrictive date filters (res_valid_till, etc.) payload = { "opt": { - "sort": None, - "requireTotalCount": True, - "searchOperation": "contains", - "searchValue": None, - "skip": 0, - "take": 10, - "userData": {}, "filter": ["product_reg_number_2023", "contains", registry_number] } } @@ -28,24 +31,115 @@ async def fetch_gisp_data(registry_number: str): response.raise_for_status() data = response.json() - # GISP usually returns { "data": [ ... 
], "totalCount": N } - if "data" in data and len(data["data"]) > 0: - # Return the URL or specific entry found - # Based on the DevExtreme schema, we might need a specific ID to form the URL - return data["data"] + if "items" in data and len(data["items"]) > 0: + return data["items"] return None except Exception as e: print(f"API scraping error: {e}") return None -@app.get("/scrape/{registry_number:path}") -async def get_product_link(registry_number: str): +def get_driver(): + chrome_options = Options() + chrome_options.add_argument("--remote-debugging-port=9222") + chrome_options.add_argument("--headless=new") + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-dev-shm-usage") + chrome_options.add_argument("--window-size=1920,1080") + chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") + chrome_options.set_capability("browserName", "chrome") + chrome_options.set_capability("platformName", "linux") + + driver = webdriver.Remote( + command_executor="http://selenium-hub:4444/wd/hub", + options=chrome_options + ) + return driver + +def scrape_characteristics(url: str) -> Dict: + driver = None + try: + driver = get_driver() + driver.get(url) + + # Wait for the page to load initial content + wait = WebDriverWait(driver, 20) + + # Wait for the tab to be clickable + wait = WebDriverWait(driver, 15) + # Try to find all tabs by the specified class + tabs = driver.find_elements(By.CLASS_NAME, "ant-tabs-tab") + if len(tabs) >= 2: + # Use execute_script for a more reliable click on dynamic elements + driver.execute_script("arguments[0].click();", tabs[1]) + time.sleep(5) + + # Wait for product-characteristic elements + try: + wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "product-characteristic"))) + except: + print("Timed out waiting for characteristics elements") + + html = driver.page_source + soup = BeautifulSoup(html, 
'html.parser') + + characteristics = {} + + # Strategy: find all rows that look like key-value pairs + rows = soup.find_all(class_="product-characteristic") + for row in rows: + name_el = row.find(class_="product-characteristic__name") + value_el = row.find(class_="product-characteristic__value") + + if name_el and value_el: + l_text = name_el.get_text(strip=True) + v_text = value_el.get_text(strip=True) + if l_text: + characteristics[l_text] = v_text + + return characteristics + except Exception as e: + print(f"Selenium error: {e}") + return {"error": str(e)} + finally: + if driver: + driver.quit() + +@app.get("/details/{registry_number:path}") +async def get_product_details(registry_number: str): + registry_number = registry_number.strip() + results = await fetch_gisp_data(registry_number) if not results: - raise HTTPException(status_code=404, detail="Product not found or scraping failed") + raise HTTPException(status_code=404, detail=f"Product with registry number {registry_number} not found") - # We can refine this to return the specific URL or the whole result object - return {"registry_number": registry_number, "results": results} + target_item = results[0] + product_gisp_url = target_item.get("product_gisp_url", "") + + if not product_gisp_url: + return { + "registry_number": registry_number, + "basic_info": target_item, + "characteristics": {"error": "No product catalog URL found"} + } + + # Run Selenium in a thread pool to avoid blocking the event loop + loop = asyncio.get_running_loop() + characteristics = await loop.run_in_executor(None, scrape_characteristics, product_gisp_url) + + return { + "registry_number": registry_number, + "product_name": target_item.get("product_name"), + "manufacturer": { + "name": target_item.get("org_name"), + "inn": target_item.get("org_inn") + }, + "technical_info": { + "okpd2": target_item.get("product_okpd2"), + "tnved": target_item.get("product_tnved") + }, + "characteristics": characteristics, + "raw_data": target_item + 
} @app.get("/health") def health(): diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..c6619d9 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,29 @@ +version: '3.8' + +services: + gisp-scraper: + build: . + ports: + - "8000:8000" + environment: + - SELENIUM_HUB_URL=http://selenium-hub:4444 + depends_on: + - selenium-hub + - selenium-node-chrome + + selenium-hub: + image: selenium/hub:4.16.1 + ports: + - "4444:4444" + + selenium-node-chrome: + image: selenium/node-chrome:4.16.1 + environment: + - SE_EVENT_BUS_HOST=selenium-hub + - SE_EVENT_BUS_PUBLISH_PORT=4442 + - SE_EVENT_BUS_SUBSCRIBE_PORT=4443 + - SHM_SIZE=2g + volumes: + - /dev/shm:/dev/shm + depends_on: + - selenium-hub diff --git a/ingress.yaml b/ingress.yaml new file mode 100644 index 0000000..e997ed3 --- /dev/null +++ b/ingress.yaml @@ -0,0 +1,18 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: gisp-scraper-ingress + annotations: + traefik.ingress.kubernetes.io/router.entrypoints: web +spec: + rules: + - host: gisp-scraper.danilkolesnikov.ru + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: gisp-scraper + port: + number: 8000 diff --git a/k8s.yaml b/k8s.yaml index eb2373a..27baa00 100644 --- a/k8s.yaml +++ b/k8s.yaml @@ -1,5 +1,44 @@ apiVersion: apps/v1 kind: Deployment +metadata: + name: gisp-scraper + labels: + app: gisp-scraper +spec: + replicas: 1 + selector: + matchLabels: + app: gisp-scraper + template: + metadata: + labels: + app: gisp-scraper + spec: + containers: + - name: gisp-scraper + image: git.danilkolesnikov.ru/flash/gisp-scraper:latest + env: + - name: SELENIUM_HUB_URL + value: "http://selenium-hub:4444/wd/hub" + ports: + - containerPort: 8000 +--- +apiVersion: v1 +kind: Service +metadata: + name: gisp-scraper +spec: + selector: + app: gisp-scraper + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + nodePort: 30001 + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment metadata: name: 
selenium-hub spec: @@ -12,13 +51,9 @@ spec: labels: app: selenium-hub spec: - tolerations: - - key: "node.kubernetes.io/disk-pressure" - operator: "Exists" - effect: "NoSchedule" containers: - name: selenium-hub - image: selenium/hub:latest + image: selenium/hub:4.16.1 ports: - containerPort: 4444 --- @@ -30,15 +65,15 @@ spec: selector: app: selenium-hub ports: - - port: 4444 - name: http - targetPort: 4444 - - port: 4442 - name: publish - targetPort: 4442 - - port: 4443 - name: subscribe - targetPort: 4443 + - name: http + port: 4444 + targetPort: 4444 + - name: publish + port: 4442 + targetPort: 4442 + - name: subscribe + port: 4443 + targetPort: 4443 --- apiVersion: apps/v1 kind: Deployment @@ -54,20 +83,18 @@ spec: labels: app: selenium-node-chrome spec: - tolerations: - - key: "node.kubernetes.io/disk-pressure" - operator: "Exists" - effect: "NoSchedule" containers: - name: selenium-node-chrome - image: selenium/node-chrome:latest + image: selenium/node-chrome:4.16.1 env: - name: SE_EVENT_BUS_HOST - value: "selenium-hub" + value: selenium-hub - name: SE_EVENT_BUS_PUBLISH_PORT value: "4442" - name: SE_EVENT_BUS_SUBSCRIBE_PORT value: "4443" + - name: SHM_SIZE + value: "2g" volumeMounts: - name: dshm mountPath: /dev/shm @@ -75,43 +102,3 @@ spec: - name: dshm emptyDir: medium: Memory ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: gisp-scraper -spec: - replicas: 1 - selector: - matchLabels: - app: gisp-scraper - template: - metadata: - labels: - app: gisp-scraper - spec: - tolerations: - - key: "node.kubernetes.io/disk-pressure" - operator: "Exists" - effect: "NoSchedule" - containers: - - name: gisp-scraper - image: git.danilkolesnikov.ru/flash/gisp-scraper:latest - ports: - - containerPort: 8000 - env: - - name: SELENIUM_HUB_URL - value: "http://selenium-hub:4444/wd/hub" ---- -apiVersion: v1 -kind: Service -metadata: - name: gisp-scraper -spec: - selector: - app: gisp-scraper - ports: - - protocol: TCP - port: 80 - targetPort: 8000 - type: ClusterIP diff --git a/requirements.txt b/requirements.txt index d23d558..108b5c3 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ fastapi uvicorn httpx +selenium +beautifulsoup4