Add docker-compose and update Selenium scraping logic with target selectors

This commit is contained in:
Flash
2026-04-10 19:56:19 +00:00
parent 75f51121ea
commit 1b62fe732c
5 changed files with 211 additions and 81 deletions

View File

@@ -1,23 +1,26 @@
import asyncio
import os
import re
import time
from typing import Dict, List, Optional

import httpx
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
app = FastAPI(title="GISP Scraper API (API-Direct)")
app = FastAPI(title="GISP Scraper API (Selenium Mode)")
# The API endpoint identified from network inspection
# Configuration
API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/"
SELENIUM_HUB_URL = "http://selenium-hub:4444/wd/hub"
async def fetch_gisp_data(registry_number: str):
# Constructing the filter payload based on what we saw in the Network tab
# We will remove the restrictive date filters (res_valid_till, etc.)
payload = {
"opt": {
"sort": None,
"requireTotalCount": True,
"searchOperation": "contains",
"searchValue": None,
"skip": 0,
"take": 10,
"userData": {},
"filter": ["product_reg_number_2023", "contains", registry_number]
}
}
@@ -28,24 +31,115 @@ async def fetch_gisp_data(registry_number: str):
response.raise_for_status()
data = response.json()
# GISP usually returns { "data": [ ... ], "totalCount": N }
if "data" in data and len(data["data"]) > 0:
# Return the URL or specific entry found
# Based on the DevExtreme schema, we might need a specific ID to form the URL
return data["data"]
if "items" in data and len(data["items"]) > 0:
return data["items"]
return None
except Exception as e:
print(f"API scraping error: {e}")
return None
@app.get("/scrape/{registry_number:path}")
async def get_product_link(registry_number: str):
def get_driver():
    """Create a headless Chrome session on the remote Selenium Grid hub.

    Returns:
        selenium.webdriver.Remote: an open WebDriver session; the caller
        is responsible for calling ``driver.quit()``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--remote-debugging-port=9222")
    # "new" headless mode renders much closer to a real browser than legacy.
    chrome_options.add_argument("--headless=new")
    # Required when Chrome runs as root inside a container.
    chrome_options.add_argument("--no-sandbox")
    # Avoid renderer crashes caused by the tiny default /dev/shm in containers.
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    # Present a regular desktop UA so the site serves the normal page.
    chrome_options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    chrome_options.set_capability("browserName", "chrome")
    chrome_options.set_capability("platformName", "linux")
    # Fix: use the module-level SELENIUM_HUB_URL constant instead of a
    # duplicated hard-coded hub address (same value, single source of truth).
    driver = webdriver.Remote(
        command_executor=SELENIUM_HUB_URL,
        options=chrome_options,
    )
    return driver
def scrape_characteristics(url: str) -> Dict:
    """Open a GISP product page and extract its characteristics table.

    Clicks the second Ant Design tab (where the characteristics live),
    waits for the key/value rows to render, then parses them with
    BeautifulSoup.

    Args:
        url: Absolute URL of the product page in the GISP catalog.

    Returns:
        Mapping of characteristic name -> value; ``{"error": ...}`` when
        the browser session fails.
    """
    driver = None
    try:
        driver = get_driver()
        driver.get(url)
        # Fix: a single wait object — the previous code assigned a 20s wait
        # and then immediately shadowed it with a 15s one.
        wait = WebDriverWait(driver, 15)
        # The characteristics live on the second tab of the tab bar.
        tabs = driver.find_elements(By.CLASS_NAME, "ant-tabs-tab")
        if len(tabs) >= 2:
            # JS click is more reliable than .click() on dynamically
            # re-rendered elements.
            driver.execute_script("arguments[0].click();", tabs[1])
            time.sleep(5)  # allow the tab's async content request to finish
        try:
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CLASS_NAME, "product-characteristic")
                )
            )
        except TimeoutException:
            # Fix: catch the specific timeout instead of a bare except that
            # would also swallow KeyboardInterrupt/SystemExit. Best-effort:
            # we still parse whatever is in the DOM.
            print("Timed out waiting for characteristics elements")
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        characteristics = {}
        # Each row is a key/value pair identified by the BEM class names.
        for row in soup.find_all(class_="product-characteristic"):
            name_el = row.find(class_="product-characteristic__name")
            value_el = row.find(class_="product-characteristic__value")
            if name_el and value_el:
                l_text = name_el.get_text(strip=True)
                if l_text:
                    characteristics[l_text] = value_el.get_text(strip=True)
        return characteristics
    except Exception as e:
        print(f"Selenium error: {e}")
        return {"error": str(e)}
    finally:
        # Always release the remote browser session, even on failure.
        if driver:
            driver.quit()
@app.get("/details/{registry_number:path}")
async def get_product_details(registry_number: str):
    """Look up a product by GISP registry number and scrape its details.

    First queries the GISP JSON API for the registry entry, then (when the
    entry links to a catalog page) scrapes the characteristics table with
    Selenium.

    Raises:
        HTTPException: 404 when the registry number is unknown or the
        upstream API call failed.
    """
    registry_number = registry_number.strip()
    results = await fetch_gisp_data(registry_number)
    if not results:
        raise HTTPException(status_code=404, detail=f"Product with registry number {registry_number} not found")
    # The API may return several rows; treat the first as the best match.
    target_item = results[0]
    product_gisp_url = target_item.get("product_gisp_url", "")
    if not product_gisp_url:
        # Without a catalog URL there is nothing to scrape; still return
        # the basic API data instead of failing outright.
        return {
            "registry_number": registry_number,
            "basic_info": target_item,
            "characteristics": {"error": "No product catalog URL found"}
        }
    # Selenium is blocking; run it in the default executor so the event
    # loop stays responsive. Fix: get_running_loop() is the non-deprecated
    # way to obtain the loop from inside a coroutine.
    loop = asyncio.get_running_loop()
    characteristics = await loop.run_in_executor(None, scrape_characteristics, product_gisp_url)
    return {
        "registry_number": registry_number,
        "product_name": target_item.get("product_name"),
        "manufacturer": {
            "name": target_item.get("org_name"),
            "inn": target_item.get("org_inn")
        },
        "technical_info": {
            "okpd2": target_item.get("product_okpd2"),
            "tnved": target_item.get("product_tnved")
        },
        "characteristics": characteristics,
        "raw_data": target_item
    }
@app.get("/health")
def health():

29
docker-compose.yml Normal file
View File

@@ -0,0 +1,29 @@
version: '3.8'

services:
  # FastAPI scraper service, built from the local Dockerfile.
  gisp-scraper:
    build: .
    ports:
      - "8000:8000"
    environment:
      # Fix: point at the grid's WebDriver endpoint — the application and
      # the k8s manifest both use the /wd/hub path.
      - SELENIUM_HUB_URL=http://selenium-hub:4444/wd/hub
    depends_on:
      - selenium-hub
      - selenium-node-chrome

  # Selenium Grid hub: browser nodes register here and WebDriver
  # sessions are routed through it.
  selenium-hub:
    image: selenium/hub:4.16.1
    ports:
      - "4444:4444"

  # Chrome browser node that registers with the hub over the event bus.
  selenium-node-chrome:
    image: selenium/node-chrome:4.16.1
    environment:
      - SE_EVENT_BUS_HOST=selenium-hub
      - SE_EVENT_BUS_PUBLISH_PORT=4442
      - SE_EVENT_BUS_SUBSCRIBE_PORT=4443
      - SHM_SIZE=2g
    volumes:
      # Chrome needs a large shared-memory segment to avoid tab crashes.
      - /dev/shm:/dev/shm
    depends_on:
      - selenium-hub

18
ingress.yaml Normal file
View File

@@ -0,0 +1,18 @@
# Ingress routing external HTTP traffic to the gisp-scraper Service.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gisp-scraper-ingress
  annotations:
    # Expose only on Traefik's plain-HTTP entrypoint.
    traefik.ingress.kubernetes.io/router.entrypoints: web
spec:
  rules:
    - host: gisp-scraper.danilkolesnikov.ru
      http:
        paths:
          # Route everything under / to the scraper API on port 8000.
          - path: /
            pathType: Prefix
            backend:
              service:
                name: gisp-scraper
                port:
                  number: 8000

107
k8s.yaml
View File

@@ -1,5 +1,44 @@
# Deployment for the FastAPI scraper; talks to the Selenium hub via
# the in-cluster service DNS name.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gisp-scraper
  labels:
    app: gisp-scraper
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gisp-scraper
  template:
    metadata:
      labels:
        app: gisp-scraper
    spec:
      containers:
        - name: gisp-scraper
          image: git.danilkolesnikov.ru/flash/gisp-scraper:latest
          env:
            # WebDriver endpoint of the Selenium Grid hub service.
            - name: SELENIUM_HUB_URL
              value: "http://selenium-hub:4444/wd/hub"
          ports:
            - containerPort: 8000
---
# NodePort service exposing the scraper API on every node at :30001.
apiVersion: v1
kind: Service
metadata:
  name: gisp-scraper
spec:
  selector:
    app: gisp-scraper
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
      nodePort: 30001
  type: NodePort
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: selenium-hub
spec:
@@ -12,13 +51,9 @@ spec:
labels:
app: selenium-hub
spec:
tolerations:
- key: "node.kubernetes.io/disk-pressure"
operator: "Exists"
effect: "NoSchedule"
containers:
- name: selenium-hub
image: selenium/hub:latest
image: selenium/hub:4.16.1
ports:
- containerPort: 4444
---
@@ -30,15 +65,9 @@ spec:
selector:
app: selenium-hub
ports:
- port: 4444
name: http
targetPort: 4444
- port: 4442
name: publish
targetPort: 4442
- port: 4443
name: subscribe
targetPort: 4443
- protocol: TCP
port: 4444
targetPort: 4444
---
apiVersion: apps/v1
kind: Deployment
@@ -54,20 +83,18 @@ spec:
labels:
app: selenium-node-chrome
spec:
tolerations:
- key: "node.kubernetes.io/disk-pressure"
operator: "Exists"
effect: "NoSchedule"
containers:
- name: selenium-node-chrome
image: selenium/node-chrome:latest
image: selenium/node-chrome:4.16.1
env:
- name: SE_EVENT_BUS_HOST
value: "selenium-hub"
value: selenium-hub
- name: SE_EVENT_BUS_PUBLISH_PORT
value: "4442"
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
value: "4443"
- name: SHM_SIZE
value: "2g"
volumeMounts:
- name: dshm
mountPath: /dev/shm
@@ -75,43 +102,3 @@ spec:
- name: dshm
emptyDir:
medium: Memory
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: gisp-scraper
spec:
replicas: 1
selector:
matchLabels:
app: gisp-scraper
template:
metadata:
labels:
app: gisp-scraper
spec:
tolerations:
- key: "node.kubernetes.io/disk-pressure"
operator: "Exists"
effect: "NoSchedule"
containers:
- name: gisp-scraper
image: git.danilkolesnikov.ru/flash/gisp-scraper:latest
ports:
- containerPort: 8000
env:
- name: SELENIUM_HUB_URL
value: "http://selenium-hub:4444/wd/hub"
---
apiVersion: v1
kind: Service
metadata:
name: gisp-scraper
spec:
selector:
app: gisp-scraper
ports:
- protocol: TCP
port: 80
targetPort: 8000
type: ClusterIP

View File

@@ -1,3 +1,5 @@
# Web framework and ASGI server
fastapi
uvicorn
# Async HTTP client for the GISP JSON API
httpx
# Browser automation against the Selenium Grid
selenium
# HTML parsing of the scraped product pages
beautifulsoup4