Files
gisp-pp719v2-scraper/app/main.py

147 lines
5.0 KiB
Python

import httpx
import re
import os
import time
import asyncio
from fastapi import FastAPI, HTTPException
from typing import List, Optional, Dict
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# ASGI application object; the route decorators further down register on it.
app = FastAPI(title="GISP Scraper API (Selenium Mode)")
# Configuration
# Endpoint that fetch_gisp_data() POSTs its JSON filter to.
API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/"
# Address of the remote Selenium hub (same value that get_driver() connects to).
SELENIUM_HUB_URL = "http://selenium-hub:4444/wd/hub"
async def fetch_gisp_data(registry_number: str):
    """Query the registry API for entries whose number contains *registry_number*.

    Sends a single POST to ``API_URL`` with a "contains" filter on the
    ``product_reg_number_2023`` field.

    Returns:
        The value stored under ``"items"`` in the JSON reply when that key is
        present and non-empty; ``None`` when nothing was found or when any
        error occurred (the error is printed, never raised).
    """
    request_body = {
        "opt": {
            "filter": ["product_reg_number_2023", "contains", registry_number],
        },
    }
    async with httpx.AsyncClient() as http:
        try:
            reply = await http.post(API_URL, json=request_body, timeout=30.0)
            reply.raise_for_status()
            parsed = reply.json()
            has_items = "items" in parsed and len(parsed["items"]) > 0
            return parsed["items"] if has_items else None
        except Exception as exc:
            # Best-effort lookup: every failure is reported as "no data".
            print(f"API scraping error: {exc}")
            return None
def get_driver():
    """Open a headless Chrome session on the remote Selenium hub.

    The hub address is taken from the module-level ``SELENIUM_HUB_URL``
    constant so that it is configured in exactly one place.

    Returns:
        A ``webdriver.Remote`` instance. The caller owns the session and is
        responsible for calling ``quit()`` on it.
    """
    chrome_options = Options()
    chrome_flags = (
        "--remote-debugging-port=9222",
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    )
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    chrome_options.set_capability("browserName", "chrome")
    chrome_options.set_capability("platformName", "linux")

    return webdriver.Remote(
        command_executor=SELENIUM_HUB_URL,
        options=chrome_options,
    )
def scrape_characteristics(url: str) -> Dict:
    """Load *url* in a remote browser and collect the product characteristics.

    Steps:
      1. Open the page and wait (up to 15 s) until at least two elements
         with class ``ant-tabs-tab`` exist, then click the second one.
      2. Wait (up to 15 s) for elements with class ``product-characteristic``.
      3. Parse the rendered HTML and pair each
         ``product-characteristic__name`` with its
         ``product-characteristic__value``.

    Args:
        url: Address of the product page to open.

    Returns:
        A mapping of characteristic name to value (possibly empty), or
        ``{"error": "<message>"}`` if the browser session failed. The
        function is blocking and never raises for Selenium errors.
    """
    # Local import: the file's top-level imports do not bring in selenium's
    # exception types, and this block must stay self-contained.
    from selenium.common.exceptions import TimeoutException

    def _two_or_more_tabs(drv):
        # Custom wait condition: truthy (the tab list) once >= 2 tabs exist.
        found = drv.find_elements(By.CLASS_NAME, "ant-tabs-tab")
        return found if len(found) >= 2 else False

    driver = None
    try:
        driver = get_driver()
        driver.get(url)
        wait = WebDriverWait(driver, 15)

        # Looking the tabs up immediately after get() races the page's own
        # rendering, so poll for them instead of doing a one-shot lookup.
        try:
            tabs = wait.until(_two_or_more_tabs)
        except TimeoutException:
            tabs = []  # page has fewer than two tabs; parse what is there

        if tabs:
            # Use execute_script for a more reliable click on dynamic elements
            driver.execute_script("arguments[0].click();", tabs[1])
            # Fixed pause kept from the original flow so the newly selected
            # tab has time to replace its content before we look for rows.
            time.sleep(5)

        # Only a timeout is tolerated here; any other driver failure falls
        # through to the outer handler and is reported as an error.
        try:
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CLASS_NAME, "product-characteristic")
                )
            )
        except TimeoutException:
            print("Timed out waiting for characteristics elements")

        soup = BeautifulSoup(driver.page_source, "html.parser")
        characteristics = {}
        for row in soup.find_all(class_="product-characteristic"):
            name_el = row.find(class_="product-characteristic__name")
            value_el = row.find(class_="product-characteristic__value")
            if name_el is None or value_el is None:
                continue
            label = name_el.get_text(strip=True)
            if label:
                characteristics[label] = value_el.get_text(strip=True)
        return characteristics
    except Exception as e:
        print(f"Selenium error: {e}")
        return {"error": str(e)}
    finally:
        # Always release the remote browser session, even on failure.
        if driver is not None:
            driver.quit()
@app.get("/details/{registry_number:path}")
async def get_product_details(registry_number: str):
    """Return registry data and scraped characteristics for one product.

    Args:
        registry_number: Registry number taken from the URL path; surrounding
            whitespace is ignored.

    Returns:
        A JSON-serialisable dict. When the registry entry has no
        ``product_gisp_url`` the dict holds ``registry_number``,
        ``basic_info`` and an error marker under ``characteristics``;
        otherwise it holds the product name, manufacturer, technical codes,
        the scraped ``characteristics`` and the raw registry item.

    Raises:
        HTTPException: 400 if the registry number is blank, 404 if the
            registry lookup returned nothing.
    """
    registry_number = registry_number.strip()
    if not registry_number:
        # A blank value would become a `contains ""` filter upstream, which
        # presumably matches every record and would hand back an arbitrary one.
        raise HTTPException(status_code=400, detail="Registry number must not be empty")

    results = await fetch_gisp_data(registry_number)
    if not results:
        raise HTTPException(status_code=404, detail=f"Product with registry number {registry_number} not found")

    # NOTE(review): the upstream filter is "contains", so the first item is
    # not guaranteed to be an exact match - confirm this is acceptable.
    target_item = results[0]
    product_gisp_url = target_item.get("product_gisp_url", "")
    if not product_gisp_url:
        return {
            "registry_number": registry_number,
            "basic_info": target_item,
            "characteristics": {"error": "No product catalog URL found"}
        }

    # Run Selenium in a thread pool to avoid blocking the event loop.
    # get_running_loop() is the preferred way to get the loop in a coroutine.
    loop = asyncio.get_running_loop()
    characteristics = await loop.run_in_executor(None, scrape_characteristics, product_gisp_url)

    return {
        "registry_number": registry_number,
        "product_name": target_item.get("product_name"),
        "manufacturer": {
            "name": target_item.get("org_name"),
            "inn": target_item.get("org_inn")
        },
        "technical_info": {
            "okpd2": target_item.get("product_okpd2"),
            "tnved": target_item.get("product_tnved")
        },
        "characteristics": characteristics,
        "raw_data": target_item
    }
@app.get("/health")
def health():
    """Liveness probe: unconditionally report the service as up."""
    return dict(status="ok")