Add docker-compose and update Selenium scraping logic with target selectors
This commit is contained in:
136
app/main.py
136
app/main.py
@@ -1,23 +1,26 @@
|
||||
import httpx
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
import asyncio
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from typing import List, Optional, Dict
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Single application instance; every route in this module registers on it.
# (An earlier "API-Direct" instance used to be created first and was
# immediately overwritten by this one, so only this assignment is kept.)
app = FastAPI(title="GISP Scraper API (Selenium Mode)")

# Configuration
# Registry search endpoint, identified from network inspection of the site.
API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/"
# Default Selenium Grid endpoint used to create remote browser sessions.
SELENIUM_HUB_URL = "http://selenium-hub:4444/wd/hub"
|
||||
|
||||
async def fetch_gisp_data(registry_number: str):
|
||||
# Constructing the filter payload based on what we saw in the Network tab
|
||||
# We will remove the restrictive date filters (res_valid_till, etc.)
|
||||
payload = {
|
||||
"opt": {
|
||||
"sort": None,
|
||||
"requireTotalCount": True,
|
||||
"searchOperation": "contains",
|
||||
"searchValue": None,
|
||||
"skip": 0,
|
||||
"take": 10,
|
||||
"userData": {},
|
||||
"filter": ["product_reg_number_2023", "contains", registry_number]
|
||||
}
|
||||
}
|
||||
@@ -28,24 +31,115 @@ async def fetch_gisp_data(registry_number: str):
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
# GISP usually returns { "data": [ ... ], "totalCount": N }
|
||||
if "data" in data and len(data["data"]) > 0:
|
||||
# Return the URL or specific entry found
|
||||
# Based on the DevExtreme schema, we might need a specific ID to form the URL
|
||||
return data["data"]
|
||||
if "items" in data and len(data["items"]) > 0:
|
||||
return data["items"]
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"API scraping error: {e}")
|
||||
return None
|
||||
|
||||
@app.get("/scrape/{registry_number:path}")
|
||||
async def get_product_link(registry_number: str):
|
||||
def get_driver():
    """Open a headless Chrome session on the remote Selenium hub.

    The hub address is read from the ``SELENIUM_HUB_URL`` environment
    variable when it is set and non-empty, and otherwise falls back to the
    module-level ``SELENIUM_HUB_URL`` constant. Previously the address was
    hard-coded here, so the environment variable had no effect.

    Returns:
        The ``webdriver.Remote`` instance. The caller owns it and is
        responsible for calling ``quit()``.
    """
    chrome_options = Options()
    for argument in (
        "--remote-debugging-port=9222",
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ):
        chrome_options.add_argument(argument)

    chrome_options.set_capability("browserName", "chrome")
    chrome_options.set_capability("platformName", "linux")

    # Environment override first, module constant as the default.
    hub_url = os.environ.get("SELENIUM_HUB_URL") or SELENIUM_HUB_URL

    return webdriver.Remote(
        command_executor=hub_url,
        options=chrome_options,
    )
|
||||
|
||||
def scrape_characteristics(url: str) -> Dict:
    """Load a product page in a remote browser and collect its characteristics.

    Opens ``url``, clicks the second ``ant-tabs-tab`` element when at least
    two tabs show up, then parses every ``product-characteristic`` row into a
    name -> value mapping.

    Args:
        url: Address of the product page to open.

    Returns:
        A dict mapping characteristic names to their text values (possibly
        empty), or ``{"error": "<message>"}`` if anything in the browser
        session fails.
    """
    # Local import: the file's visible import block does not bring this name in.
    from selenium.common.exceptions import TimeoutException

    driver = None
    try:
        driver = get_driver()
        driver.get(url)

        wait = WebDriverWait(driver, 15)

        def _tabs_ready(drv):
            # Truthy (the element list) once at least two tabs exist.
            found = drv.find_elements(By.CLASS_NAME, "ant-tabs-tab")
            return found if len(found) >= 2 else False

        # The tabs are presumably rendered by page scripts after the initial
        # load (TODO confirm), so poll for them instead of looking only once.
        try:
            tabs = wait.until(_tabs_ready)
        except TimeoutException:
            print("Timed out waiting for tabs")
            tabs = []

        if tabs:
            # Use execute_script for a more reliable click on dynamic elements
            driver.execute_script("arguments[0].click();", tabs[1])
            # Fixed pause kept from the original: the wait below returns as
            # soon as the first row exists, which may be before all rows do.
            time.sleep(5)

        # Best effort: a timeout here still lets us parse whatever is rendered.
        try:
            wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "product-characteristic")))
        except TimeoutException:
            print("Timed out waiting for characteristics elements")

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        characteristics = {}
        for row in soup.find_all(class_="product-characteristic"):
            name_el = row.find(class_="product-characteristic__name")
            value_el = row.find(class_="product-characteristic__value")
            if not (name_el and value_el):
                continue
            label = name_el.get_text(strip=True)
            if label:
                characteristics[label] = value_el.get_text(strip=True)

        return characteristics
    except Exception as e:
        # Boundary handler: report the failure to the caller instead of raising.
        print(f"Selenium error: {e}")
        return {"error": str(e)}
    finally:
        if driver:
            driver.quit()
|
||||
|
||||
@app.get("/details/{registry_number:path}")
async def get_product_details(registry_number: str):
    """Return registry data and scraped characteristics for one product.

    Looks the product up through ``fetch_gisp_data``, takes the first match,
    and, when that entry carries a ``product_gisp_url``, scrapes the page's
    characteristics with Selenium in a worker thread.

    Args:
        registry_number: Registry number from the request path; surrounding
            whitespace is stripped before the lookup.

    Returns:
        A dict with the registry number, selected fields of the matched
        entry, the scraped ``characteristics`` and the entry as ``raw_data``.
        When the entry has no catalog URL, a reduced dict with ``basic_info``
        and an error marker under ``characteristics`` is returned instead.

    Raises:
        HTTPException: 404 when the lookup yields no results.
    """
    registry_number = registry_number.strip()

    results = await fetch_gisp_data(registry_number)
    if not results:
        raise HTTPException(status_code=404, detail=f"Product with registry number {registry_number} not found")

    target_item = results[0]
    product_gisp_url = target_item.get("product_gisp_url", "")

    if not product_gisp_url:
        return {
            "registry_number": registry_number,
            "basic_info": target_item,
            "characteristics": {"error": "No product catalog URL found"},
        }

    # Run Selenium in a thread pool to avoid blocking the event loop.
    # get_running_loop() is the call intended for use inside a coroutine.
    loop = asyncio.get_running_loop()
    characteristics = await loop.run_in_executor(None, scrape_characteristics, product_gisp_url)

    return {
        "registry_number": registry_number,
        "product_name": target_item.get("product_name"),
        "manufacturer": {
            "name": target_item.get("org_name"),
            "inn": target_item.get("org_inn"),
        },
        "technical_info": {
            "okpd2": target_item.get("product_okpd2"),
            "tnved": target_item.get("product_tnved"),
        },
        "characteristics": characteristics,
        "raw_data": target_item,
    }
|
||||
|
||||
@app.get("/health")
|
||||
def health():
|
||||
|
||||
29
docker-compose.yml
Normal file
29
docker-compose.yml
Normal file
@@ -0,0 +1,29 @@
|
||||
# Local stack: the scraper API plus a Selenium Grid hub and one Chrome node.
version: '3.8'

services:
  gisp-scraper:
    build: .
    ports:
      - "8000:8000"
    environment:
      # Same endpoint (including /wd/hub) as the value set in k8s.yaml and
      # the application's built-in default.
      - SELENIUM_HUB_URL=http://selenium-hub:4444/wd/hub
    depends_on:
      - selenium-hub
      - selenium-node-chrome

  selenium-hub:
    image: selenium/hub:4.16.1
    ports:
      - "4444:4444"

  selenium-node-chrome:
    image: selenium/node-chrome:4.16.1
    environment:
      - SE_EVENT_BUS_HOST=selenium-hub
      - SE_EVENT_BUS_PUBLISH_PORT=4442
      - SE_EVENT_BUS_SUBSCRIBE_PORT=4443
      # NOTE(review): unclear whether the node image reads SHM_SIZE; shared
      # memory is provided by the /dev/shm mount below. Confirm or remove.
      - SHM_SIZE=2g
    volumes:
      - /dev/shm:/dev/shm
    depends_on:
      - selenium-hub
|
||||
18
ingress.yaml
Normal file
18
ingress.yaml
Normal file
@@ -0,0 +1,18 @@
|
||||
# Ingress that routes the public host to the gisp-scraper Service.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gisp-scraper-ingress
  annotations:
    # Attach the router to the Traefik entrypoint named "web".
    traefik.ingress.kubernetes.io/router.entrypoints: web
spec:
  rules:
    - host: gisp-scraper.danilkolesnikov.ru
      http:
        paths:
          # Every path under "/" goes to the same backend.
          - path: /
            pathType: Prefix
            backend:
              service:
                name: gisp-scraper
                port:
                  number: 8000
|
||||
107
k8s.yaml
107
k8s.yaml
@@ -1,5 +1,44 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: gisp-scraper
|
||||
labels:
|
||||
app: gisp-scraper
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: gisp-scraper
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: gisp-scraper
|
||||
spec:
|
||||
containers:
|
||||
- name: gisp-scraper
|
||||
image: git.danilkolesnikov.ru/flash/gisp-scraper:latest
|
||||
env:
|
||||
- name: SELENIUM_HUB_URL
|
||||
value: "http://selenium-hub:4444/wd/hub"
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: gisp-scraper
|
||||
spec:
|
||||
selector:
|
||||
app: gisp-scraper
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
nodePort: 30001
|
||||
type: NodePort
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: selenium-hub
|
||||
spec:
|
||||
@@ -12,13 +51,9 @@ spec:
|
||||
labels:
|
||||
app: selenium-hub
|
||||
spec:
|
||||
tolerations:
|
||||
- key: "node.kubernetes.io/disk-pressure"
|
||||
operator: "Exists"
|
||||
effect: "NoSchedule"
|
||||
containers:
|
||||
- name: selenium-hub
|
||||
image: selenium/hub:latest
|
||||
image: selenium/hub:4.16.1
|
||||
ports:
|
||||
- containerPort: 4444
|
||||
---
|
||||
@@ -30,15 +65,9 @@ spec:
|
||||
selector:
|
||||
app: selenium-hub
|
||||
ports:
|
||||
- port: 4444
|
||||
name: http
|
||||
targetPort: 4444
|
||||
- port: 4442
|
||||
name: publish
|
||||
targetPort: 4442
|
||||
- port: 4443
|
||||
name: subscribe
|
||||
targetPort: 4443
|
||||
- protocol: TCP
|
||||
port: 4444
|
||||
targetPort: 4444
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
@@ -54,20 +83,18 @@ spec:
|
||||
labels:
|
||||
app: selenium-node-chrome
|
||||
spec:
|
||||
tolerations:
|
||||
- key: "node.kubernetes.io/disk-pressure"
|
||||
operator: "Exists"
|
||||
effect: "NoSchedule"
|
||||
containers:
|
||||
- name: selenium-node-chrome
|
||||
image: selenium/node-chrome:latest
|
||||
image: selenium/node-chrome:4.16.1
|
||||
env:
|
||||
- name: SE_EVENT_BUS_HOST
|
||||
value: "selenium-hub"
|
||||
value: selenium-hub
|
||||
- name: SE_EVENT_BUS_PUBLISH_PORT
|
||||
value: "4442"
|
||||
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
|
||||
value: "4443"
|
||||
- name: SHM_SIZE
|
||||
value: "2g"
|
||||
volumeMounts:
|
||||
- name: dshm
|
||||
mountPath: /dev/shm
|
||||
@@ -75,43 +102,3 @@ spec:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: gisp-scraper
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: gisp-scraper
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: gisp-scraper
|
||||
spec:
|
||||
tolerations:
|
||||
- key: "node.kubernetes.io/disk-pressure"
|
||||
operator: "Exists"
|
||||
effect: "NoSchedule"
|
||||
containers:
|
||||
- name: gisp-scraper
|
||||
image: git.danilkolesnikov.ru/flash/gisp-scraper:latest
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
env:
|
||||
- name: SELENIUM_HUB_URL
|
||||
value: "http://selenium-hub:4444/wd/hub"
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: gisp-scraper
|
||||
spec:
|
||||
selector:
|
||||
app: gisp-scraper
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 80
|
||||
targetPort: 8000
|
||||
type: ClusterIP
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
httpx
|
||||
selenium
|
||||
beautifulsoup4
|
||||
|
||||
Reference in New Issue
Block a user