Add docker-compose and update Selenium scraping logic with target selectors
This commit is contained in:
136
app/main.py
136
app/main.py
@@ -1,23 +1,26 @@
|
|||||||
import httpx
|
import httpx
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import asyncio
|
||||||
from fastapi import FastAPI, HTTPException
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from typing import List, Optional, Dict
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
app = FastAPI(title="GISP Scraper API (API-Direct)")
|
app = FastAPI(title="GISP Scraper API (Selenium Mode)")
|
||||||
|
|
||||||
# The API endpoint identified from network inspection
|
# Configuration
|
||||||
API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/"
|
API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/"
|
||||||
|
SELENIUM_HUB_URL = "http://selenium-hub:4444/wd/hub"
|
||||||
|
|
||||||
async def fetch_gisp_data(registry_number: str):
|
async def fetch_gisp_data(registry_number: str):
|
||||||
# Constructing the filter payload based on what we saw in the Network tab
|
|
||||||
# We will remove the restrictive date filters (res_valid_till, etc.)
|
|
||||||
payload = {
|
payload = {
|
||||||
"opt": {
|
"opt": {
|
||||||
"sort": None,
|
|
||||||
"requireTotalCount": True,
|
|
||||||
"searchOperation": "contains",
|
|
||||||
"searchValue": None,
|
|
||||||
"skip": 0,
|
|
||||||
"take": 10,
|
|
||||||
"userData": {},
|
|
||||||
"filter": ["product_reg_number_2023", "contains", registry_number]
|
"filter": ["product_reg_number_2023", "contains", registry_number]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -28,24 +31,115 @@ async def fetch_gisp_data(registry_number: str):
|
|||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
data = response.json()
|
data = response.json()
|
||||||
|
|
||||||
# GISP usually returns { "data": [ ... ], "totalCount": N }
|
if "items" in data and len(data["items"]) > 0:
|
||||||
if "data" in data and len(data["data"]) > 0:
|
return data["items"]
|
||||||
# Return the URL or specific entry found
|
|
||||||
# Based on the DevExtreme schema, we might need a specific ID to form the URL
|
|
||||||
return data["data"]
|
|
||||||
return None
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"API scraping error: {e}")
|
print(f"API scraping error: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@app.get("/scrape/{registry_number:path}")
|
def get_driver():
    """Open a headless-Chrome session on the remote Selenium Grid.

    The grid address is taken from the ``SELENIUM_HUB_URL`` environment
    variable, which the deployment manifests set for this service. When the
    variable is unset or empty, the module-level ``SELENIUM_HUB_URL``
    default is used instead.

    Returns:
        A ``webdriver.Remote`` instance. The caller owns the session and
        must call ``quit()`` on it when done.
    """
    # ``or`` (rather than a .get() default) so an empty value also falls back.
    hub_url = os.environ.get("SELENIUM_HUB_URL") or SELENIUM_HUB_URL

    chrome_options = Options()
    # NOTE(review): a fixed debugging port could collide if one node ever runs
    # several sessions at once -- confirm this flag is still required.
    chrome_options.add_argument("--remote-debugging-port=9222")
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    chrome_options.set_capability("browserName", "chrome")
    chrome_options.set_capability("platformName", "linux")

    driver = webdriver.Remote(
        command_executor=hub_url,
        options=chrome_options
    )
    return driver
|
||||||
|
|
||||||
|
def scrape_characteristics(url: str) -> Dict:
    """Load a product page in a remote browser and extract its characteristics.

    Opens ``url``, clicks the second ``ant-tabs-tab`` element once it exists,
    waits for ``product-characteristic`` rows and then parses name/value
    pairs out of the rendered page source.

    Args:
        url: Address of the product page to scrape.

    Returns:
        A dict mapping each characteristic name to its value (empty when no
        rows were found). If any step raises, ``{"error": "<message>"}`` is
        returned instead. The browser session is always closed.
    """
    # Imported locally so this block does not need a new module-level import.
    from selenium.common.exceptions import TimeoutException

    def _second_tab(drv):
        """Return the second tab element once at least two tabs exist, else False."""
        tabs = drv.find_elements(By.CLASS_NAME, "ant-tabs-tab")
        return tabs[1] if len(tabs) >= 2 else False

    driver = None
    try:
        driver = get_driver()
        driver.get(url)

        wait = WebDriverWait(driver, 15)

        # find_elements() alone returns immediately, so a tab bar that renders
        # late would be missed; poll for it instead of checking only once.
        try:
            tab = wait.until(_second_tab)
        except TimeoutException:
            tab = None
            print("Timed out waiting for the second tab")

        if tab is not None:
            # Use execute_script for a more reliable click on dynamic elements
            driver.execute_script("arguments[0].click();", tab)
            # Give the newly selected tab time to render its content.
            time.sleep(5)

        # Best effort: if the rows never appear, parse whatever is on the page.
        try:
            wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "product-characteristic")))
        except TimeoutException:
            print("Timed out waiting for characteristics elements")

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        characteristics = {}
        for row in soup.find_all(class_="product-characteristic"):
            name_el = row.find(class_="product-characteristic__name")
            value_el = row.find(class_="product-characteristic__value")
            if not (name_el and value_el):
                continue

            name_text = name_el.get_text(strip=True)
            # Rows with an empty name cannot be keyed and are skipped.
            if name_text:
                characteristics[name_text] = value_el.get_text(strip=True)

        return characteristics
    except Exception as e:
        # Boundary handler: report the failure to the caller as data.
        print(f"Selenium error: {e}")
        return {"error": str(e)}
    finally:
        if driver:
            driver.quit()
|
||||||
|
|
||||||
|
@app.get("/details/{registry_number:path}")
async def get_product_details(registry_number: str):
    """Return registry data and scraped characteristics for one product.

    Looks the registry number up through ``fetch_gisp_data`` and uses the
    first hit. If that hit carries a ``product_gisp_url``, the page is scraped
    with ``scrape_characteristics``.

    Args:
        registry_number: Registry number from the URL path; surrounding
            whitespace is stripped before the lookup.

    Returns:
        When the hit has no ``product_gisp_url``: a dict with
        ``registry_number``, ``basic_info`` (the raw hit) and an error entry
        under ``characteristics``. Otherwise: a dict with the product name,
        manufacturer, technical codes, scraped ``characteristics`` and the
        raw hit under ``raw_data``.

    Raises:
        HTTPException: 404 when ``fetch_gisp_data`` returns nothing.
    """
    registry_number = registry_number.strip()

    results = await fetch_gisp_data(registry_number)
    if not results:
        raise HTTPException(status_code=404, detail=f"Product with registry number {registry_number} not found")

    target_item = results[0]
    product_gisp_url = target_item.get("product_gisp_url", "")

    if not product_gisp_url:
        return {
            "registry_number": registry_number,
            "basic_info": target_item,
            "characteristics": {"error": "No product catalog URL found"}
        }

    # Run Selenium in a thread pool to avoid blocking the event loop.
    # get_running_loop() is the recommended call inside a coroutine.
    loop = asyncio.get_running_loop()
    characteristics = await loop.run_in_executor(None, scrape_characteristics, product_gisp_url)

    return {
        "registry_number": registry_number,
        "product_name": target_item.get("product_name"),
        "manufacturer": {
            "name": target_item.get("org_name"),
            "inn": target_item.get("org_inn")
        },
        "technical_info": {
            "okpd2": target_item.get("product_okpd2"),
            "tnved": target_item.get("product_tnved")
        },
        "characteristics": characteristics,
        "raw_data": target_item
    }
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
def health():
|
def health():
|
||||||
|
|||||||
29
docker-compose.yml
Normal file
29
docker-compose.yml
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
# Local stack: the scraper API plus a Selenium Grid (hub + one Chrome node).
version: '3.8'

services:
  gisp-scraper:
    build: .
    ports:
      - "8000:8000"
    environment:
      # Same endpoint as the k8s manifest and the in-code default.
      - SELENIUM_HUB_URL=http://selenium-hub:4444/wd/hub
    depends_on:
      - selenium-hub
      - selenium-node-chrome

  selenium-hub:
    image: selenium/hub:4.16.1
    ports:
      - "4444:4444"

  selenium-node-chrome:
    image: selenium/node-chrome:4.16.1
    # Shared memory for Chrome is sized with the Compose `shm_size` key.
    # A SHM_SIZE environment variable does not resize /dev/shm, and the
    # host /dev/shm bind mount is no longer needed once this is set.
    shm_size: 2g
    environment:
      - SE_EVENT_BUS_HOST=selenium-hub
      - SE_EVENT_BUS_PUBLISH_PORT=4442
      - SE_EVENT_BUS_SUBSCRIBE_PORT=4443
    depends_on:
      - selenium-hub
|
||||||
18
ingress.yaml
Normal file
18
ingress.yaml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# Ingress routing every path on gisp-scraper.danilkolesnikov.ru to the
# gisp-scraper Service on port 8000.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gisp-scraper-ingress
  annotations:
    # Attach this route to Traefik's "web" entrypoint.
    traefik.ingress.kubernetes.io/router.entrypoints: web
spec:
  rules:
    - host: gisp-scraper.danilkolesnikov.ru
      http:
        paths:
          - pathType: Prefix
            path: /
            backend:
              service:
                name: gisp-scraper
                port:
                  number: 8000
|
||||||
105
k8s.yaml
105
k8s.yaml
@@ -1,5 +1,44 @@
|
|||||||
# Deployment running a single replica of the scraper API container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gisp-scraper
  labels:
    app: gisp-scraper
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gisp-scraper
  template:
    metadata:
      labels:
        app: gisp-scraper
    spec:
      containers:
        - name: gisp-scraper
          image: git.danilkolesnikov.ru/flash/gisp-scraper:latest
          ports:
            - containerPort: 8000
          env:
            # Selenium Grid endpoint handed to the application.
            - name: SELENIUM_HUB_URL
              value: "http://selenium-hub:4444/wd/hub"
|
||||||
|
---
|
||||||
|
# Service exposing the scraper pods on port 8000, also reachable on
# node port 30001.
apiVersion: v1
kind: Service
metadata:
  name: gisp-scraper
spec:
  type: NodePort
  selector:
    app: gisp-scraper
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
      nodePort: 30001
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
metadata:
|
metadata:
|
||||||
name: selenium-hub
|
name: selenium-hub
|
||||||
spec:
|
spec:
|
||||||
@@ -12,13 +51,9 @@ spec:
|
|||||||
labels:
|
labels:
|
||||||
app: selenium-hub
|
app: selenium-hub
|
||||||
spec:
|
spec:
|
||||||
tolerations:
|
|
||||||
- key: "node.kubernetes.io/disk-pressure"
|
|
||||||
operator: "Exists"
|
|
||||||
effect: "NoSchedule"
|
|
||||||
containers:
|
containers:
|
||||||
- name: selenium-hub
|
- name: selenium-hub
|
||||||
image: selenium/hub:latest
|
image: selenium/hub:4.16.1
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 4444
|
- containerPort: 4444
|
||||||
---
|
---
|
||||||
@@ -30,15 +65,9 @@ spec:
|
|||||||
selector:
|
selector:
|
||||||
app: selenium-hub
|
app: selenium-hub
|
||||||
ports:
|
ports:
|
||||||
- port: 4444
|
- protocol: TCP
|
||||||
name: http
|
port: 4444
|
||||||
targetPort: 4444
|
targetPort: 4444
|
||||||
- port: 4442
|
|
||||||
name: publish
|
|
||||||
targetPort: 4442
|
|
||||||
- port: 4443
|
|
||||||
name: subscribe
|
|
||||||
targetPort: 4443
|
|
||||||
---
|
---
|
||||||
apiVersion: apps/v1
|
apiVersion: apps/v1
|
||||||
kind: Deployment
|
kind: Deployment
|
||||||
@@ -54,20 +83,18 @@ spec:
|
|||||||
labels:
|
labels:
|
||||||
app: selenium-node-chrome
|
app: selenium-node-chrome
|
||||||
spec:
|
spec:
|
||||||
tolerations:
|
|
||||||
- key: "node.kubernetes.io/disk-pressure"
|
|
||||||
operator: "Exists"
|
|
||||||
effect: "NoSchedule"
|
|
||||||
containers:
|
containers:
|
||||||
- name: selenium-node-chrome
|
- name: selenium-node-chrome
|
||||||
image: selenium/node-chrome:latest
|
image: selenium/node-chrome:4.16.1
|
||||||
env:
|
env:
|
||||||
- name: SE_EVENT_BUS_HOST
|
- name: SE_EVENT_BUS_HOST
|
||||||
value: "selenium-hub"
|
value: selenium-hub
|
||||||
- name: SE_EVENT_BUS_PUBLISH_PORT
|
- name: SE_EVENT_BUS_PUBLISH_PORT
|
||||||
value: "4442"
|
value: "4442"
|
||||||
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
|
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
|
||||||
value: "4443"
|
value: "4443"
|
||||||
|
- name: SHM_SIZE
|
||||||
|
value: "2g"
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: dshm
|
- name: dshm
|
||||||
mountPath: /dev/shm
|
mountPath: /dev/shm
|
||||||
@@ -75,43 +102,3 @@ spec:
|
|||||||
- name: dshm
|
- name: dshm
|
||||||
emptyDir:
|
emptyDir:
|
||||||
medium: Memory
|
medium: Memory
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: gisp-scraper
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: gisp-scraper
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: gisp-scraper
|
|
||||||
spec:
|
|
||||||
tolerations:
|
|
||||||
- key: "node.kubernetes.io/disk-pressure"
|
|
||||||
operator: "Exists"
|
|
||||||
effect: "NoSchedule"
|
|
||||||
containers:
|
|
||||||
- name: gisp-scraper
|
|
||||||
image: git.danilkolesnikov.ru/flash/gisp-scraper:latest
|
|
||||||
ports:
|
|
||||||
- containerPort: 8000
|
|
||||||
env:
|
|
||||||
- name: SELENIUM_HUB_URL
|
|
||||||
value: "http://selenium-hub:4444/wd/hub"
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: gisp-scraper
|
|
||||||
spec:
|
|
||||||
selector:
|
|
||||||
app: gisp-scraper
|
|
||||||
ports:
|
|
||||||
- protocol: TCP
|
|
||||||
port: 80
|
|
||||||
targetPort: 8000
|
|
||||||
type: ClusterIP
|
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
fastapi
|
fastapi
|
||||||
uvicorn
|
uvicorn
|
||||||
httpx
|
httpx
|
||||||
|
selenium
|
||||||
|
beautifulsoup4
|
||||||
|
|||||||
Reference in New Issue
Block a user