781 lines
29 KiB
Python
781 lines
29 KiB
Python
"""中国政府采购网爬虫。"""
|
||
import json
|
||
import os
|
||
import re
|
||
import time
|
||
from datetime import datetime, timedelta, timezone
|
||
from typing import Dict, List, Optional, Tuple
|
||
from urllib.parse import quote, urlencode, urljoin
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
|
||
from rag import RAGProcessor
|
||
|
||
|
||
class CCGPCrawler:
    """Keyword crawler for the China Government Procurement site (ccgp.gov.cn).

    The crawler queries the site's search endpoint, parses the result list,
    downloads each announcement's detail page and can persist the results as
    JSON and, optionally, into a vector database through ``RAGProcessor``.
    When the search endpoint reports a block, the first page of a search falls
    back to scanning the public listing pages in ``LISTING_SOURCES``.
    """

    SEARCH_URL = "https://search.ccgp.gov.cn/bxsearch"
    SOURCE_NAME = "中国政府采购网"
    # ``crawl_by_keywords`` treats a search page with fewer items than this as
    # the last page of results.
    SEARCH_PAGE_SIZE = 20
    # Listing pages scanned by the fallback, each tagged with the announcement
    # type recorded on the items parsed from it.
    LISTING_SOURCES = [
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/gkzb/", "project_type": "公开招标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/zbgg/", "project_type": "中标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/cjgg/", "project_type": "成交公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/jzxcs/", "project_type": "竞争性磋商公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/jzxtpgg/", "project_type": "竞争性谈判公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/xjgg/", "project_type": "询价公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/gkzb/", "project_type": "公开招标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/zbgg/", "project_type": "中标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/cjgg/", "project_type": "成交公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/jzxcs/", "project_type": "竞争性磋商公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/jzxtpgg/", "project_type": "竞争性谈判公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/xjgg/", "project_type": "询价公告"},
    ]

    def __init__(
        self,
        timeout: int = 15,
        request_delay: float = 1.0,
        max_retries: int = 3,
        use_browser_fallback: bool = True,
        browser_headless: bool = True,
    ):
        """Create a crawler with its own HTTP session.

        Args:
            timeout: Per-request timeout in seconds; also used (plus a margin)
                for the Selenium page load.
            request_delay: Default pause in seconds used by ``_sleep``.
            max_retries: Number of HTTP attempts per URL.
            use_browser_fallback: Whether a failed HTTP fetch may be retried
                through a Selenium-driven Edge browser.
            browser_headless: Whether that browser runs headless.
        """
        self.timeout = timeout
        self.request_delay = request_delay
        self.max_retries = max_retries
        self.use_browser_fallback = use_browser_fallback
        self.browser_headless = browser_headless
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
                ),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Cache-Control": "no-cache",
                "Pragma": "no-cache",
                "Connection": "keep-alive",
            }
        )
        # Chronological list of diagnostic events (see ``_record_diagnostic``).
        self.diagnostics: List[Dict] = []
        # Error text of the most recent failed ``_request_html`` call.
        self.last_error: str = ""

    def _record_diagnostic(self, event: str, **payload) -> None:
        """Append a timestamped diagnostic entry carrying ``payload``."""
        diagnostic = {
            "event": event,
            "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
        diagnostic.update(payload)
        self.diagnostics.append(diagnostic)

    def _sleep(self, seconds: Optional[float] = None) -> None:
        """Pause for ``seconds``, or for ``request_delay`` when omitted."""
        time.sleep(self.request_delay if seconds is None else seconds)

    def _is_blocked_response(self, html: str, title: str = "") -> Tuple[bool, str]:
        """Detect a block/error page.

        Only the page title and the first 3000 characters of ``html`` are
        inspected.

        Returns:
            ``(True, signal)`` with the first matching phrase, otherwise
            ``(False, "")``.
        """
        signals = [
            "频繁访问",
            "访问过于频繁",
            "操作过于频繁",
            "禁止访问",
            "访问异常",
            "安全验证",
            "403 Forbidden",
            "抱歉,您的请求被阻断了",
            "You are unable to access",
            "Internal Server Error",
        ]
        check_text = f"{title}\n{html[:3000]}"
        for signal in signals:
            if signal in check_text:
                return True, signal
        return False, ""

    def _decode_response(self, response: "requests.Response") -> str:
        """Decode the response body, trying several encodings in order.

        Candidates are UTF-8, the detected encoding, the declared encoding and
        GB18030 (duplicates and empty values skipped). If none decodes the
        body strictly, ``response.text`` is returned.
        """
        encodings = []
        for encoding in ("utf-8", response.apparent_encoding, response.encoding, "gb18030"):
            if encoding and encoding not in encodings:
                encodings.append(encoding)

        for encoding in encodings:
            try:
                return response.content.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                # LookupError: the detected/declared charset name is not a
                # codec Python knows; just move on to the next candidate.
                continue

        return response.text

    def _request_html(
        self,
        url: str,
        params: Optional[Dict] = None,
        allow_browser_fallback: bool = True,
    ) -> Tuple[Optional[str], Optional[str], str]:
        """Fetch ``url`` and return ``(html, title, final_url)``.

        HTTP errors and request exceptions are retried up to ``max_retries``
        times with an increasing pause; a detected block page stops the
        retries immediately. When every attempt fails and both
        ``allow_browser_fallback`` and ``use_browser_fallback`` are true, the
        page is requested once more through the browser.

        Returns:
            ``(html, title, url)`` on success. On failure
            ``(None, None, requested_url)``, with the reason stored in
            ``self.last_error``.
        """
        last_error = ""
        for attempt in range(1, self.max_retries + 1):
            try:
                response = self.session.get(url, params=params, timeout=self.timeout)
                html = self._decode_response(response)
                soup = BeautifulSoup(html, "html.parser")
                title = soup.title.get_text(strip=True) if soup.title else ""

                blocked, signal = self._is_blocked_response(html, title)
                if blocked:
                    last_error = f"site_blocked:{signal}"
                    self._record_diagnostic(
                        "blocked_response",
                        request_url=response.url,
                        status_code=response.status_code,
                        page_title=title,
                        signal=signal,
                    )
                    # Retrying a blocked request is pointless; leave the loop.
                    break

                if response.status_code >= 400:
                    last_error = f"http_status:{response.status_code}"
                    self._record_diagnostic(
                        "http_error",
                        request_url=response.url,
                        status_code=response.status_code,
                        page_title=title,
                    )
                else:
                    return html, title, response.url
            except requests.RequestException as exc:
                last_error = str(exc)
                self._record_diagnostic(
                    "request_exception",
                    request_url=self._build_url(url, params),
                    error=str(exc),
                    attempt=attempt,
                )

            if attempt < self.max_retries:
                # Back off 2s, 4s, ... capped at 5s.
                self._sleep(min(2.0 * attempt, 5.0))

        if allow_browser_fallback and self.use_browser_fallback:
            browser_html, browser_title, browser_url, browser_error = self._request_html_by_browser(
                self._build_url(url, params)
            )
            if browser_html:
                return browser_html, browser_title, browser_url
            if browser_error:
                last_error = f"{last_error}; browser:{browser_error}" if last_error else browser_error

        self.last_error = last_error
        return None, None, self._build_url(url, params)

    def _request_html_by_browser(self, url: str) -> Tuple[Optional[str], Optional[str], str, str]:
        """Fetch ``url`` through a Selenium-driven Edge browser.

        Selenium is imported lazily so the crawler still works without it.

        Returns:
            ``(html, title, current_url, "")`` on success, or
            ``(None, None, url, error)`` when Selenium is unavailable, the
            page looks blocked, or the browser raises.
        """
        try:
            from selenium import webdriver
            from selenium.webdriver.common.by import By
            from selenium.webdriver.edge.options import Options as EdgeOptions
            from selenium.webdriver.support import expected_conditions as EC
            from selenium.webdriver.support.ui import WebDriverWait
        except Exception as exc:
            error = f"selenium_unavailable:{exc}"
            self._record_diagnostic("browser_unavailable", request_url=url, error=error)
            return None, None, url, error

        driver = None
        try:
            options = EdgeOptions()
            if self.browser_headless:
                options.add_argument("--headless=new")
            options.add_argument("--disable-gpu")
            options.add_argument("--window-size=1600,1200")
            options.add_argument("--disable-blink-features=AutomationControlled")
            options.add_argument("--no-sandbox")
            options.add_argument(
                "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
            )

            driver = webdriver.Edge(options=options)
            driver.set_page_load_timeout(self.timeout + 10)
            driver.get(url)
            WebDriverWait(driver, self.timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            # Fixed extra wait after <body> appears, before reading the page.
            time.sleep(2)
            html = driver.page_source
            title = driver.title or ""
            current_url = driver.current_url

            blocked, signal = self._is_blocked_response(html, title)
            if blocked:
                error = f"site_blocked:{signal}"
                self._record_diagnostic(
                    "browser_blocked_response",
                    request_url=current_url,
                    page_title=title,
                    signal=signal,
                )
                return None, None, current_url, error

            self._record_diagnostic("browser_fallback_success", request_url=current_url, page_title=title)
            return html, title, current_url, ""
        except Exception as exc:
            error = str(exc)
            self._record_diagnostic("browser_exception", request_url=url, error=error)
            return None, None, url, error
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    # Best-effort cleanup: a failing quit must not mask the
                    # result or error produced above.
                    pass

    def _build_url(self, url: str, params: Optional[Dict]) -> str:
        """Return ``url`` with ``params`` appended as a query string.

        ``url`` is returned unchanged when ``params`` is empty. If ``url``
        already contains a query string the parameters are joined with ``&``
        instead of a second ``?``.
        """
        if not params:
            return url
        separator = "&" if "?" in url else "?"
        return f"{url}{separator}{urlencode(params, doseq=True)}"

    def _build_search_url(self, params: Dict) -> str:
        """Build the search URL, percent-encoding every value as UTF-8.

        ``None`` values are sent as empty strings; other values are converted
        with ``str`` before encoding. Keys are emitted as given.
        """
        encoded_parts = []
        for key, value in params.items():
            text = "" if value is None else str(value)
            encoded_value = quote(text, safe="", encoding="utf-8", errors="strict")
            encoded_parts.append(f"{key}={encoded_value}")
        return f"{self.SEARCH_URL}?{'&'.join(encoded_parts)}"

    def _normalize_date(self, raw_date: str) -> str:
        """Return the first date found in ``raw_date`` as ``YYYY-MM-DD``.

        The stripped input is returned unchanged when no date is recognised.
        """
        raw_date = raw_date.strip()
        match = re.search(r"(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})", raw_date)
        if not match:
            return raw_date
        year, month, day = match.groups()
        return f"{int(year):04d}-{int(month):02d}-{int(day):02d}"

    def _normalize_datetime(self, raw_text: str) -> str:
        """Return the first date (and ``H:MM`` time, if present) in ``raw_text``.

        The result is ``YYYY-MM-DD`` or ``YYYY-MM-DD H:MM``; the stripped input
        is returned unchanged when no date is recognised.
        """
        raw_text = raw_text.strip()
        match = re.search(r"(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})(?:日)?\s*(\d{1,2}:\d{2})?", raw_text)
        if not match:
            return raw_text
        year, month, day, time_part = match.groups()
        date_part = f"{int(year):04d}-{int(month):02d}-{int(day):02d}"
        return f"{date_part} {time_part}" if time_part else date_part

    @staticmethod
    def _build_std_timestamp(date_text: str) -> int:
        """Build a Unix timestamp for 08:00:00 UTC+8 on the given date.

        Returns 0 when ``date_text`` is empty or not in ``YYYY-MM-DD`` form.
        """
        if not date_text:
            return 0
        try:
            beijing_tz = timezone(timedelta(hours=8))
            dt = datetime.strptime(date_text, "%Y-%m-%d").replace(
                hour=8,
                minute=0,
                second=0,
                microsecond=0,
                tzinfo=beijing_tz,
            )
            return int(dt.timestamp())
        except ValueError:
            return 0

    def _extract_total(self, soup: "BeautifulSoup") -> int:
        """Return the result count announced in the page text, or 0."""
        text = soup.get_text(" ", strip=True)
        patterns = [
            r"共\s*(\d+)\s*条",
            r"约\s*(\d+)\s*条",
            r"找到\s*(\d+)\s*条",
        ]
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return int(match.group(1))
        return 0

    def _extract_result_nodes(self, soup: "BeautifulSoup") -> List:
        """Return the ``<li>`` nodes that look like search results.

        Selectors are tried from most to least specific; the first selector
        yielding at least one plausible node wins. A node is plausible when it
        has a link whose text is at least 4 characters long and whose href is
        site-relative or mentions ``ccgp.gov.cn``.
        """
        selectors = [
            "ul.vT-srch-result-list-bid > li",
            ".vT-srch-result-list-bid li",
            ".vT-srch-result-list li",
            ".search-result li",
            "li",
        ]
        for selector in selectors:
            nodes = soup.select(selector)
            candidate_nodes = []
            for node in nodes:
                link = node.find("a", href=True)
                if not link:
                    continue
                href = link["href"].strip()
                title = link.get_text(" ", strip=True)
                if len(title) < 4:
                    continue
                if "ccgp.gov.cn" not in href and not href.startswith("/"):
                    continue
                candidate_nodes.append(node)
            if candidate_nodes:
                return candidate_nodes
        return []

    def _parse_search_items(self, soup: "BeautifulSoup", keyword: str) -> List[Dict]:
        """Turn a parsed search page into a list of result dictionaries.

        Each item carries title, absolute URL, normalized date, timestamp,
        announcement type, region, the searched keyword, source name, crawl
        time and an empty ``content`` to be filled in later.
        """
        # Order matters: the first type found in the node text wins, so the
        # specific names precede the generic "采购公告" / "招标公告".
        known_types = (
            "公开招标公告",
            "竞争性磋商公告",
            "竞争性谈判公告",
            "询价公告",
            "成交公告",
            "中标公告",
            "更正公告",
            "废标公告",
            "单一来源公告",
            "采购公告",
            "招标公告",
        )

        items: List[Dict] = []
        for node in self._extract_result_nodes(soup):
            link = node.find("a", href=True)
            if not link:
                continue

            title = re.sub(r"\s+", " ", link.get_text(" ", strip=True))
            href = urljoin(self.SEARCH_URL, link["href"].strip())
            node_text = re.sub(r"\s+", " ", node.get_text(" ", strip=True))

            date_match = re.search(r"\d{4}[-/.年]\d{1,2}[-/.月]\d{1,2}", node_text)
            date = self._normalize_date(date_match.group(0)) if date_match else ""
            std_timestamp = self._build_std_timestamp(date)

            region = ""
            # Accept both the ASCII and the full-width colon after the label.
            region_match = re.search(r"(?:地区|地域|行政区划)[::]?\s*([^\s|]+)", node_text)
            if region_match:
                region = region_match.group(1)
            else:
                span_texts = [span.get_text(" ", strip=True) for span in node.find_all(["span", "em"])]
                for span_text in span_texts:
                    if "地区" in span_text or "地域" in span_text:
                        # Keep whatever follows the last colon of either kind.
                        region = re.split(r"[::]", span_text)[-1].strip()
                        break

            project_type = ""
            for known_type in known_types:
                if known_type in node_text:
                    project_type = known_type
                    break

            items.append(
                {
                    "title": title,
                    "url": href,
                    "date": date,
                    "std_timestamp": std_timestamp,
                    "project_type": project_type,
                    "region": region,
                    "keyword": keyword,
                    "source": self.SOURCE_NAME,
                    "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "content": "",
                }
            )
        return items

    def _build_listing_page_url(self, base_url: str, page_index: int) -> str:
        """Return the URL of listing page ``page_index`` (1-based).

        Page 1 is ``base_url`` itself; page N is ``index_{N-1}.htm`` resolved
        against ``base_url``.
        """
        if page_index <= 1:
            return base_url
        return urljoin(base_url, f"index_{page_index - 1}.htm")

    def _extract_listing_page_count(self, html: str) -> int:
        """Read the page count from the ``Pager({size:...})`` call in ``html``.

        Returns 1 when the call is not found; never less than 1.
        """
        match = re.search(r"Pager\(\{size:(\d+),\s*current:\d+,\s*prefix:'index',suffix:'htm'\}\)", html)
        if not match:
            return 1
        return max(int(match.group(1)), 1)

    def _parse_listing_items(self, html: str, page_url: str, keyword: str, project_type: str) -> List[Dict]:
        """Parse one listing page into result dictionaries.

        The first ``<em>`` of each entry is read as the publish time and the
        second as the region; links are resolved against ``page_url``.
        """
        soup = BeautifulSoup(html, "html.parser")
        items: List[Dict] = []
        for node in soup.select("ul.c_list_bid > li"):
            link = node.find("a", href=True)
            if not link:
                continue

            title = re.sub(r"\s+", " ", link.get_text(" ", strip=True))
            href = urljoin(page_url, link["href"].strip())
            meta_values = [em.get_text(" ", strip=True) for em in node.find_all("em")]
            publish_time = self._normalize_datetime(meta_values[0]) if len(meta_values) > 0 else ""
            region = meta_values[1] if len(meta_values) > 1 else ""
            date = publish_time.split(" ")[0] if publish_time else ""
            std_timestamp = self._build_std_timestamp(date)

            items.append(
                {
                    "title": title,
                    "url": href,
                    "date": date,
                    "std_timestamp": std_timestamp,
                    "project_type": project_type,
                    "region": region,
                    "keyword": keyword,
                    "source": self.SOURCE_NAME,
                    "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "content": "",
                }
            )
        return items

    def _scan_listing_pages_for_keyword(
        self,
        keyword: str,
        max_items: int = 20,
        max_pages_per_category: int = 3,
    ) -> List[Dict]:
        """Find announcements mentioning ``keyword`` by scanning listing pages.

        Up to ``max_pages_per_category`` pages of every listing source are
        fetched (without browser fallback). Entries whose title contains the
        keyword, followed by entries whose title contains a related hint word,
        then have their detail page downloaded; an entry is kept only when the
        keyword occurs in its title or content.

        Returns:
            At most ``max_items`` matching items with ``content`` filled in.
        """
        keyword = keyword.strip()
        title_candidates: List[Dict] = []
        related_candidates: List[Dict] = []
        seen_urls = set()
        transport_hints = [
            keyword,
            "交通",
            "信号",
            "道路",
            "路口",
            "绿波",
            "信控",
            "交警",
            "公交",
            "轨道",
            "导航",
            "地图",
            "信号灯",
            "红绿灯",
            "诱导",
        ]

        for source in self.LISTING_SOURCES:
            total_pages = 1
            for page_index in range(1, max_pages_per_category + 1):
                page_url = self._build_listing_page_url(source["url"], page_index)
                html, _, final_url = self._request_html(
                    page_url,
                    params=None,
                    allow_browser_fallback=False,
                )
                if not html:
                    continue

                if page_index == 1:
                    total_pages = self._extract_listing_page_count(html)

                items = self._parse_listing_items(
                    html=html,
                    page_url=final_url or page_url,
                    keyword=keyword,
                    project_type=source["project_type"],
                )
                for item in items:
                    if item["url"] in seen_urls:
                        continue
                    seen_urls.add(item["url"])

                    title = item["title"]
                    if keyword in title:
                        title_candidates.append(item)
                    elif any(hint and hint in title for hint in transport_hints):
                        related_candidates.append(item)

                if page_index >= total_pages:
                    break
                self._sleep(0.4)

        results: List[Dict] = []
        fetch_queue = title_candidates + related_candidates
        for item in fetch_queue:
            content = self.fetch_detail_content(item["url"])
            item["content"] = content
            haystack = f"{item['title']}\n{content}"
            if keyword in haystack:
                results.append(item)
                if len(results) >= max_items:
                    break
            # Pause after every detail fetch, matching or not, so a run of
            # non-matching pages is not requested back-to-back.
            self._sleep(0.4)

        self._record_diagnostic(
            "listing_fallback_scan",
            keyword=keyword,
            title_candidates=len(title_candidates),
            related_candidates=len(related_candidates),
            matched=len(results),
            max_pages_per_category=max_pages_per_category,
        )
        return results

    def search_by_keyword(self, keyword: str, page_index: int = 1, bid_type: int = 0) -> Dict:
        """Run one page of a keyword search.

        When the request fails because the site blocked it, page 1 falls back
        to ``_scan_listing_pages_for_keyword`` and later pages return an empty
        successful result so the caller stops paging.

        Returns:
            A dict with ``keyword``, ``page_index``, ``total``, ``items``,
            ``success``, ``blocked``, ``message`` and ``request_url``; the
            blocked-endpoint results additionally carry ``fallback_mode``.
        """
        params = {
            "searchtype": 1,
            "page_index": page_index,
            "bidSort": 0,
            "buyerName": "",
            "projectId": "",
            "pinMu": 0,
            "bidType": bid_type,
            "dbselect": "bidx",
            "kw": keyword,
            "start_time": "",
            "end_time": "",
            "timeType": 6,
            "displayZone": "",
            "zoneId": "",
            "agentName": "",
        }

        search_url = self._build_search_url(params)
        html, title, request_url = self._request_html(search_url, params=None)
        if not html:
            if "site_blocked" in self.last_error and page_index == 1:
                fallback_items = self._scan_listing_pages_for_keyword(keyword, max_items=20)
                return {
                    "keyword": keyword,
                    "page_index": page_index,
                    "total": len(fallback_items),
                    "items": fallback_items,
                    "success": True,
                    "blocked": False,
                    "message": (
                        "search_endpoint_blocked_used_listing_fallback"
                        if fallback_items
                        else "search_endpoint_blocked_listing_fallback_no_match"
                    ),
                    "request_url": request_url,
                    "fallback_mode": "listing_scan",
                }
            if "site_blocked" in self.last_error and page_index > 1:
                return {
                    "keyword": keyword,
                    "page_index": page_index,
                    "total": 0,
                    "items": [],
                    "success": True,
                    "blocked": False,
                    "message": "search_endpoint_blocked_stop_paging",
                    "request_url": request_url,
                    "fallback_mode": "listing_scan",
                }
            return {
                "keyword": keyword,
                "page_index": page_index,
                "total": 0,
                "items": [],
                "success": False,
                "blocked": "site_blocked" in self.last_error,
                "message": self.last_error or "request_failed",
                "request_url": request_url,
            }

        soup = BeautifulSoup(html, "html.parser")
        items = self._parse_search_items(soup, keyword)
        total = self._extract_total(soup)

        if not items:
            self._record_diagnostic(
                "empty_search_parse",
                keyword=keyword,
                page_index=page_index,
                request_url=request_url,
                page_title=title or "",
            )

        return {
            "keyword": keyword,
            "page_index": page_index,
            "total": total,
            "items": items,
            "success": True,
            "blocked": False,
            "message": "",
            "request_url": request_url,
        }

    def fetch_detail_content(self, url: str) -> str:
        """Download an announcement page and return its main text.

        Content containers are tried from most to least specific; the first
        one holding more than 100 characters of text is used, with
        script/style/noscript tags removed. Returns ``""`` when the page
        cannot be fetched or no container qualifies.
        """
        html, _, request_url = self._request_html(url, params=None)
        if not html:
            self._record_diagnostic("detail_fetch_failed", request_url=request_url, error=self.last_error)
            return ""

        soup = BeautifulSoup(html, "html.parser")
        selectors = [
            "div.vF_detail_content_container",
            "div.vF_detail_content",
            "div#zoom",
            "div.content",
            "div.article",
            "div.article-content",
            "div.contxt",
            "body",
        ]

        content_node = None
        for selector in selectors:
            node = soup.select_one(selector)
            if node and len(node.get_text(" ", strip=True)) > 100:
                content_node = node
                break

        if content_node is None:
            self._record_diagnostic("detail_parse_empty", request_url=request_url)
            return ""

        for tag in content_node.find_all(["script", "style", "noscript"]):
            tag.decompose()

        text = content_node.get_text("\n", strip=True)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()

    def crawl_by_keywords(self, keywords: List[str], max_per_keyword: int = 30) -> List[Dict]:
        """Search every keyword and collect results with their detail text.

        Pages are requested until ``max_per_keyword`` new items were gathered,
        a page comes back short or empty, or the search fails. URLs already
        collected for an earlier keyword are skipped. Progress is printed.

        Returns:
            All collected items across keywords, in crawl order.
        """
        all_results: List[Dict] = []
        seen_urls = set()

        for keyword in keywords:
            print("\n" + "=" * 60)
            print(f"正在搜索关键词: {keyword}")
            print("=" * 60)

            keyword_results: List[Dict] = []
            page_index = 1
            blocked = False

            while len(keyword_results) < max_per_keyword:
                print(f"[进度] 正在获取第 {page_index} 页...")
                search_result = self.search_by_keyword(keyword, page_index=page_index)

                if not search_result["success"]:
                    print(f"[失败] 搜索失败: {search_result['message']}")
                    blocked = search_result.get("blocked", False)
                    break

                items = search_result["items"]
                if not items:
                    print("[结束] 当前页未解析到结果")
                    break

                print(f"[成功] 本页解析 {len(items)} 条结果(页面总数提示: {search_result['total']})")

                for item in items:
                    if item["url"] in seen_urls:
                        continue

                    print(f"  [抓取] {item['title'][:60]}")
                    # Items from the listing fallback already carry their
                    # detail text; only fetch when the content is missing.
                    needs_fetch = not item.get("content")
                    if needs_fetch:
                        item["content"] = self.fetch_detail_content(item["url"])
                    seen_urls.add(item["url"])
                    keyword_results.append(item)
                    all_results.append(item)

                    if len(keyword_results) >= max_per_keyword:
                        break
                    if needs_fetch:
                        self._sleep()

                # Stop on a short (last) page, and skip the inter-page pause
                # when the quota is already met and the loop is about to end.
                if len(items) < self.SEARCH_PAGE_SIZE or len(keyword_results) >= max_per_keyword:
                    break

                page_index += 1
                self._sleep(max(1.5, self.request_delay))

            print(f"[完成] 关键词 '{keyword}' 共获取 {len(keyword_results)} 条结果")
            if blocked:
                print("[提示] 当前环境可能触发了政府采购网的频率限制,已记录诊断信息")

        return all_results

    def save_results(self, results: List[Dict], output_dir: str = "./data") -> str:
        """Write ``results`` to a timestamped JSON file and return its path.

        ``output_dir`` is created when missing.
        """
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(output_dir, f"ccgp_results_{timestamp}.json")
        with open(output_path, "w", encoding="utf-8") as file:
            json.dump(results, file, ensure_ascii=False, indent=2)
        print(f"[保存] 结果已保存到: {output_path}")
        return output_path

    def save_diagnostics(self, output_dir: str = "./data") -> Optional[str]:
        """Write the collected diagnostics to a timestamped JSON file.

        Returns:
            The file path, or ``None`` when no diagnostics were recorded.
        """
        if not self.diagnostics:
            return None
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(output_dir, f"ccgp_probe_{timestamp}.json")
        payload = {
            "source": self.SOURCE_NAME,
            "test_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "success": False,
            "items": [],
            "diagnosis": self.diagnostics,
        }
        with open(output_path, "w", encoding="utf-8") as file:
            json.dump(payload, file, ensure_ascii=False, indent=2)
        print(f"[保存] 诊断信息已保存到: {output_path}")
        return output_path

    def save_to_vectordb(self, results: List[Dict], vector_db_dir: str = "./vector_db") -> None:
        """Store every result that has body text through ``RAGProcessor``.

        Items with missing, ``None`` or blank ``content`` are skipped; nothing
        is written when no item qualifies.
        """
        if not results:
            print("[RAG] 无可入库的数据")
            return

        rag_items = []
        for item in results:
            # ``or ""`` also covers an explicit ``None`` stored under "content".
            content = (item.get("content") or "").strip()
            if not content:
                continue
            rag_items.append(
                {
                    "title": item.get("title", ""),
                    "content": f"标题: {item.get('title', '')}\n\n{content}",
                    "url": item.get("url", ""),
                    "source": item.get("source", self.SOURCE_NAME),
                    "std_timestamp": item.get("std_timestamp", 0),
                }
            )

        if not rag_items:
            print("[RAG] 抓取结果没有可用正文,跳过向量库写入")
            return

        processor = RAGProcessor(vector_db_dir=vector_db_dir)
        processor.process_news(rag_items, upsert=True)
        stats = processor.get_database_stats()
        print(
            f"[RAG] 向量数据库现有 {stats['unique_news']} 条新闻,"
            f"{stats['total_documents']} 个文档片段"
        )

    def crawl_and_save(
        self,
        keywords: List[str],
        max_per_keyword: int = 30,
        output_dir: str = "./data",
        save_to_rag: bool = False,
        vector_db_dir: str = "./vector_db",
    ) -> List[Dict]:
        """Crawl ``keywords`` and persist the outcome.

        Results are saved as JSON (and to the vector database when
        ``save_to_rag`` is true). When nothing was collected, the diagnostics
        are saved instead.

        Returns:
            The crawled items.
        """
        results = self.crawl_by_keywords(keywords=keywords, max_per_keyword=max_per_keyword)

        if results:
            self.save_results(results, output_dir=output_dir)
            if save_to_rag:
                self.save_to_vectordb(results, vector_db_dir=vector_db_dir)
        else:
            self.save_diagnostics(output_dir=output_dir)

        return results
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: crawl two sample keywords, keep at most 10 results
    # per keyword, write the JSON output under ./data and skip the vector DB.
    demo_keywords = ["信控", "绿波"]
    demo_crawler = CCGPCrawler()
    fetched = demo_crawler.crawl_and_save(
        keywords=demo_keywords,
        max_per_keyword=10,
        output_dir="./data",
        save_to_rag=False,
    )
    print(f"\n爬取完成,共获取 {len(fetched)} 条结果")
|