# GISP scraper service: FastAPI endpoints backed by the GISP public API (httpx)
# and a remote Selenium Grid for pages that require JavaScript rendering.
import asyncio
import os
import re
import time
from typing import Dict, List, Optional

import httpx
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
|
app = FastAPI(title="GISP Scraper API (Selenium Mode)")

# Configuration
# Public GISP registry search endpoint; queried via POST with a JSON filter.
API_URL = "https://gisp.gov.ru/pp719v2/pub/prod/b/"
# Remote Selenium Grid hub address; "selenium-hub" is presumably a
# docker-compose service name — TODO confirm against deployment config.
SELENIUM_HUB_URL = "http://selenium-hub:4444/wd/hub"
|
|
|
async def fetch_gisp_data(registry_number: str) -> Optional[List[Dict]]:
    """Query the GISP public API for products matching a registry number.

    Args:
        registry_number: Value matched (substring, "contains") against the
            ``product_reg_number_2023`` field.

    Returns:
        A non-empty list of matching item dicts, or ``None`` when nothing
        matched or the request failed.
    """
    payload = {
        "opt": {
            "filter": ["product_reg_number_2023", "contains", registry_number]
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(API_URL, json=payload, timeout=30.0)
            response.raise_for_status()
            data = response.json()
        except (httpx.HTTPError, ValueError) as e:
            # Narrowed from a blanket `except Exception`: httpx.HTTPError
            # covers transport/timeout/status errors, ValueError covers a
            # malformed JSON body. Programming errors now propagate.
            print(f"API scraping error: {e}")
            return None

    # Normalize "no results" (missing key, empty list, or non-dict body)
    # to None for callers.
    items = data.get("items") if isinstance(data, dict) else None
    return items if items else None
|
|
|
|
def get_driver():
    """Create a headless Chrome session on the remote Selenium Grid hub.

    Returns:
        A connected ``selenium.webdriver.Remote`` driver; the caller is
        responsible for calling ``quit()`` on it.
    """
    chrome_options = Options()
    chrome_options.add_argument("--remote-debugging-port=9222")
    chrome_options.add_argument("--headless=new")
    # Standard flags for Chrome inside a container: disable the sandbox and
    # avoid the usually-too-small /dev/shm.
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    # Desktop UA string so the site serves its regular page variant.
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    chrome_options.set_capability("browserName", "chrome")
    chrome_options.set_capability("platformName", "linux")

    # Fix: reuse the module-level SELENIUM_HUB_URL constant instead of a
    # second hard-coded copy of the hub URL that could drift out of sync.
    driver = webdriver.Remote(
        command_executor=SELENIUM_HUB_URL,
        options=chrome_options,
    )
    return driver
|
|
|
|
def _parse_characteristics(html: str) -> Dict:
    """Extract name -> value characteristic pairs from a product page's HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    characteristics = {}
    # Each key-value pair is rendered as a "product-characteristic" row.
    for row in soup.find_all(class_="product-characteristic"):
        name_el = row.find(class_="product-characteristic__name")
        value_el = row.find(class_="product-characteristic__value")
        if name_el and value_el:
            name = name_el.get_text(strip=True)
            if name:
                characteristics[name] = value_el.get_text(strip=True)
    return characteristics


def scrape_characteristics(url: str) -> Dict:
    """Open a GISP product page in Selenium and scrape its characteristics tab.

    Args:
        url: Product catalog page URL.

    Returns:
        A dict of characteristic name -> value, or ``{"error": ...}`` when
        the browser session fails.
    """
    driver = None
    try:
        driver = get_driver()
        driver.get(url)

        # Fix: removed a dead WebDriverWait(driver, 20) that was immediately
        # overwritten; a single 15s wait is used for all explicit waits.
        wait = WebDriverWait(driver, 15)

        # The characteristics live on the second Ant Design tab, when present.
        tabs = driver.find_elements(By.CLASS_NAME, "ant-tabs-tab")
        if len(tabs) >= 2:
            # JS click is more reliable than .click() on dynamic elements.
            driver.execute_script("arguments[0].click();", tabs[1])
            time.sleep(5)  # allow the tab's async content to render

        try:
            wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "product-characteristic")))
        except TimeoutException:
            # Fix: was a bare `except:` that also swallowed
            # KeyboardInterrupt/SystemExit. Best-effort: fall through and
            # parse whatever rendered.
            print("Timed out waiting for characteristics elements")

        return _parse_characteristics(driver.page_source)
    except Exception as e:
        # Boundary handler: any Selenium failure is reported to the caller
        # as an error payload rather than a 500.
        print(f"Selenium error: {e}")
        return {"error": str(e)}
    finally:
        if driver:
            driver.quit()
|
|
|
|
@app.get("/details/{registry_number:path}")
async def get_product_details(registry_number: str):
    """Look up a product by registry number and scrape its characteristics.

    Args:
        registry_number: Registry number (``:path`` converter, so it may
            contain slashes); surrounding whitespace is stripped.

    Returns:
        A dict with basic info from the GISP API plus scraped characteristics.

    Raises:
        HTTPException: 404 when the API returns no matching product.
    """
    registry_number = registry_number.strip()

    results = await fetch_gisp_data(registry_number)
    if not results:
        raise HTTPException(status_code=404, detail=f"Product with registry number {registry_number} not found")

    # The first match is treated as the target product.
    target_item = results[0]
    product_gisp_url = target_item.get("product_gisp_url", "")

    if not product_gisp_url:
        return {
            "registry_number": registry_number,
            "basic_info": target_item,
            "characteristics": {"error": "No product catalog URL found"},
        }

    # Run the blocking Selenium scrape off the event loop.
    # Fix: asyncio.get_event_loop() is deprecated inside coroutines;
    # asyncio.to_thread (3.9+) is the modern equivalent of
    # run_in_executor(None, ...).
    characteristics = await asyncio.to_thread(scrape_characteristics, product_gisp_url)

    return {
        "registry_number": registry_number,
        "product_name": target_item.get("product_name"),
        "manufacturer": {
            "name": target_item.get("org_name"),
            "inn": target_item.get("org_inn"),
        },
        "technical_info": {
            "okpd2": target_item.get("product_okpd2"),
            "tnved": target_item.get("product_tnved"),
        },
        "characteristics": characteristics,
        "raw_data": target_item,
    }
|
|
|
|
@app.get("/health")
def health():
    """Liveness probe: report that the service is up."""
    probe_response = {"status": "ok"}
    return probe_response
|