Add docker-compose and update Selenium scraping logic with target selectors

This commit is contained in:
Flash
2026-04-10 19:56:19 +00:00
parent 75f51121ea
commit 1b62fe732c
5 changed files with 211 additions and 81 deletions

View File

@@ -1,23 +1,26 @@
import httpx
import re
import os
import time
import asyncio
from fastapi import FastAPI, HTTPException
from typing import List, Optional, Dict
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# FastAPI application: Selenium-backed scraper for the GISP public registry.
# (Removed a dead duplicate FastAPI() assignment left over from the diff —
# only the "Selenium Mode" instance was ever used.)
app = FastAPI(title="GISP Scraper API (Selenium Mode)")

# Configuration
# The API endpoint identified from network inspection of the GISP site.
API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/"
# Remote Selenium Grid hub; hostname resolves on the docker-compose network.
SELENIUM_HUB_URL = "http://selenium-hub:4444/wd/hub"
async def fetch_gisp_data(registry_number: str):
# Constructing the filter payload based on what we saw in the Network tab
# We will remove the restrictive date filters (res_valid_till, etc.)
payload = {
"opt": {
"sort": None,
"requireTotalCount": True,
"searchOperation": "contains",
"searchValue": None,
"skip": 0,
"take": 10,
"userData": {},
"filter": ["product_reg_number_2023", "contains", registry_number]
}
}
@@ -28,24 +31,115 @@ async def fetch_gisp_data(registry_number: str):
response.raise_for_status()
data = response.json()
# GISP usually returns { "data": [ ... ], "totalCount": N }
if "data" in data and len(data["data"]) > 0:
# Return the URL or specific entry found
# Based on the DevExtreme schema, we might need a specific ID to form the URL
return data["data"]
if "items" in data and len(data["items"]) > 0:
return data["items"]
return None
except Exception as e:
print(f"API scraping error: {e}")
return None
@app.get("/scrape/{registry_number:path}")
async def get_product_link(registry_number: str):
def get_driver():
    """Create a remote headless Chrome session on the Selenium Grid hub.

    Returns:
        selenium.webdriver.Remote: a connected WebDriver instance.
        The caller is responsible for calling ``driver.quit()``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--remote-debugging-port=9222")
    chrome_options.add_argument("--headless=new")
    # Required inside containers: no sandbox, and /dev/shm is usually too
    # small for Chrome, so spill to /tmp instead.
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    # Real-browser user agent to avoid trivial bot detection.
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    chrome_options.set_capability("browserName", "chrome")
    chrome_options.set_capability("platformName", "linux")
    # Use the module-level constant instead of duplicating the hub URL here,
    # so the endpoint is configured in exactly one place.
    driver = webdriver.Remote(
        command_executor=SELENIUM_HUB_URL,
        options=chrome_options
    )
    return driver
def scrape_characteristics(url: str) -> Dict:
    """Open *url* in a remote browser and harvest the characteristics tab.

    Clicks the second ``ant-tabs-tab`` element (presumably the product
    characteristics tab — confirm against the live page), waits for
    ``product-characteristic`` rows to render, then parses name/value pairs
    out of the page source with BeautifulSoup.

    Args:
        url: Product catalog page URL on gisp.gov.ru.

    Returns:
        Dict mapping characteristic name -> value; ``{"error": ...}`` when
        scraping fails entirely. Best-effort: a timeout while waiting for
        rows still attempts to parse whatever has rendered.
    """
    driver = None
    try:
        driver = get_driver()
        driver.get(url)
        # Single explicit wait; a redundant second assignment from the diff
        # was dropped — only this 15s wait was ever used.
        wait = WebDriverWait(driver, 15)
        # Tabs render dynamically; collect all of them by class.
        tabs = driver.find_elements(By.CLASS_NAME, "ant-tabs-tab")
        if len(tabs) >= 2:
            # JS click is more reliable than .click() on dynamic elements.
            driver.execute_script("arguments[0].click();", tabs[1])
            time.sleep(5)
            # Wait for the characteristic rows to appear after the tab switch.
            try:
                wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "product-characteristic")))
            except Exception:
                # Narrowed from a bare except (which would also trap
                # KeyboardInterrupt/SystemExit); still best-effort.
                print("Timed out waiting for characteristics elements")
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        characteristics = {}
        # Strategy: find all rows that look like key-value pairs.
        rows = soup.find_all(class_="product-characteristic")
        for row in rows:
            name_el = row.find(class_="product-characteristic__name")
            value_el = row.find(class_="product-characteristic__value")
            if name_el and value_el:
                l_text = name_el.get_text(strip=True)
                v_text = value_el.get_text(strip=True)
                if l_text:
                    characteristics[l_text] = v_text
        return characteristics
    except Exception as e:
        print(f"Selenium error: {e}")
        return {"error": str(e)}
    finally:
        # Always release the remote browser session, even on failure.
        if driver:
            driver.quit()
@app.get("/details/{registry_number:path}")
async def get_product_details(registry_number: str):
    """Look up a registry number via the GISP API and enrich the result
    with characteristics scraped from the product's catalog page.

    Args:
        registry_number: GISP product registry number (path may contain
            slashes, hence the ``:path`` converter).

    Raises:
        HTTPException: 404 when the registry number yields no results.
    """
    registry_number = registry_number.strip()
    results = await fetch_gisp_data(registry_number)
    if not results:
        # Single raise — the diff had left an older duplicate raise and an
        # unconditional early return that made the rest of this function
        # unreachable; both removed.
        raise HTTPException(status_code=404, detail=f"Product with registry number {registry_number} not found")
    target_item = results[0]
    product_gisp_url = target_item.get("product_gisp_url", "")
    if not product_gisp_url:
        # No catalog page to scrape — return the API data with a marker.
        return {
            "registry_number": registry_number,
            "basic_info": target_item,
            "characteristics": {"error": "No product catalog URL found"}
        }
    # Run blocking Selenium work in a thread pool so the event loop stays
    # responsive. get_running_loop() replaces the deprecated
    # get_event_loop() call inside a coroutine.
    loop = asyncio.get_running_loop()
    characteristics = await loop.run_in_executor(None, scrape_characteristics, product_gisp_url)
    return {
        "registry_number": registry_number,
        "product_name": target_item.get("product_name"),
        "manufacturer": {
            "name": target_item.get("org_name"),
            "inn": target_item.get("org_inn")
        },
        "technical_info": {
            "okpd2": target_item.get("product_okpd2"),
            "tnved": target_item.get("product_tnved")
        },
        "characteristics": characteristics,
        "raw_data": target_item
    }
@app.get("/health")
def health():