# newsreport_agent_for_traffic/crawler/ccgp_crawler.py
#
# 781 lines
# 29 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""中国政府采购网爬虫。"""
import json
import os
import re
import time
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote, urlencode, urljoin
import requests
from bs4 import BeautifulSoup
from rag import RAGProcessor
class CCGPCrawler:
    """Keyword crawler for the China Government Procurement site (ccgp.gov.cn).

    The crawler queries the site's search endpoint.  When that endpoint
    answers with a block page it falls back to scanning the static
    announcement listing pages for entries that mention the keyword.  Every
    failure is appended to ``self.diagnostics`` so a run can be inspected
    afterwards, and the most recent failed request is summarised in
    ``self.last_error``.
    """

    SEARCH_URL = "https://search.ccgp.gov.cn/bxsearch"
    SOURCE_NAME = "中国政府采购网"
    # Static listing pages scanned by the fallback path, with the announcement
    # type each listing is recorded under.
    LISTING_SOURCES = [
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/gkzb/", "project_type": "公开招标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/zbgg/", "project_type": "中标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/cjgg/", "project_type": "成交公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/jzxcs/", "project_type": "竞争性磋商公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/jzxtpgg/", "project_type": "竞争性谈判公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/xjgg/", "project_type": "询价公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/gkzb/", "project_type": "公开招标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/zbgg/", "project_type": "中标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/cjgg/", "project_type": "成交公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/jzxcs/", "project_type": "竞争性磋商公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/jzxtpgg/", "project_type": "竞争性谈判公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/xjgg/", "project_type": "询价公告"},
    ]
    # A page with fewer results than this is treated as the last search page.
    PAGE_SIZE = 20
    # Shared by the requests session and the Selenium fallback so both
    # transports present the same browser identity.
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    )
    # Substrings that mark a response as a block/error page.  Checked in
    # order; the first hit is reported as the signal.
    BLOCK_SIGNALS = (
        "频繁访问",
        "访问过于频繁",
        "操作过于频繁",
        "禁止访问",
        "访问异常",
        "安全验证",
        "403 Forbidden",
        "抱歉,您的请求被阻断了",
        # Same phrase with a full-width comma (U+FF0C).
        # NOTE(review): presumably what the live block page emits - confirm.
        "抱歉\uff0c您的请求被阻断了",
        "You are unable to access",
        "Internal Server Error",
    )
    # Announcement types looked for in a search result, most specific first
    # (e.g. "公开招标公告" must win over the shorter "招标公告").
    KNOWN_PROJECT_TYPES = (
        "公开招标公告",
        "竞争性磋商公告",
        "竞争性谈判公告",
        "询价公告",
        "成交公告",
        "中标公告",
        "更正公告",
        "废标公告",
        "单一来源公告",
        "采购公告",
        "招标公告",
    )
    # Title fragments that make a listing entry worth a detail fetch during
    # the fallback scan even though the keyword itself is not in the title.
    TRANSPORT_HINTS = (
        "交通",
        "信号",
        "道路",
        "路口",
        "绿波",
        "信控",
        "交警",
        "公交",
        "轨道",
        "导航",
        "地图",
        "信号灯",
        "红绿灯",
        "诱导",
    )

    def __init__(
        self,
        timeout: int = 15,
        request_delay: float = 1.0,
        max_retries: int = 3,
        use_browser_fallback: bool = True,
        browser_headless: bool = True,
    ):
        """Create a crawler with its own HTTP session.

        Args:
            timeout: Per-request timeout in seconds (also used for the
                browser wait).
            request_delay: Default pause, in seconds, used by ``_sleep``.
            max_retries: Number of HTTP attempts per URL.
            use_browser_fallback: Whether a Selenium-driven Edge browser may
                be tried after the plain HTTP attempts fail.
            browser_headless: Run the fallback browser headless.
        """
        self.timeout = timeout
        self.request_delay = request_delay
        self.max_retries = max_retries
        self.use_browser_fallback = use_browser_fallback
        self.browser_headless = browser_headless
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": self.USER_AGENT,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Cache-Control": "no-cache",
                "Pragma": "no-cache",
                "Connection": "keep-alive",
            }
        )
        self.diagnostics: List[Dict] = []
        self.last_error: str = ""

    # ------------------------------------------------------------------
    # Small utilities
    # ------------------------------------------------------------------
    def _record_diagnostic(self, event: str, **payload) -> None:
        """Append a timestamped diagnostic entry to ``self.diagnostics``."""
        entry = {
            "event": event,
            "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
        entry.update(payload)
        self.diagnostics.append(entry)

    def _sleep(self, seconds: Optional[float] = None) -> None:
        """Pause for ``seconds``, or for ``request_delay`` when omitted."""
        time.sleep(self.request_delay if seconds is None else seconds)

    def _is_blocked_response(self, html: str, title: str = "") -> Tuple[bool, str]:
        """Report whether a page looks like a block/error page.

        Only the title and the first 3000 characters of ``html`` are
        inspected.

        Returns:
            ``(True, signal)`` with the first matching entry of
            ``BLOCK_SIGNALS``, or ``(False, "")``.
        """
        check_text = f"{title}\n{html[:3000]}"
        for signal in self.BLOCK_SIGNALS:
            if signal in check_text:
                return True, signal
        return False, ""

    def _decode_response(self, response: "requests.Response") -> str:
        """Decode the response body, preferring UTF-8.

        UTF-8 is tried first; only if it fails are the detected encoding,
        the declared encoding and finally GB18030 tried.  Codec names Python
        does not know are skipped instead of raising.  ``response.text`` is
        the last resort.
        """
        raw = response.content
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            pass
        # ``apparent_encoding`` is only read once UTF-8 has failed.
        tried = {"utf-8"}
        for encoding in (response.apparent_encoding, response.encoding, "gb18030"):
            if not encoding or encoding in tried:
                continue
            tried.add(encoding)
            try:
                return raw.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                # LookupError: codec name unknown to Python.
                continue
        return response.text

    # ------------------------------------------------------------------
    # Transport
    # ------------------------------------------------------------------
    def _request_html(
        self,
        url: str,
        params: Optional[Dict] = None,
        allow_browser_fallback: bool = True,
    ) -> Tuple[Optional[str], Optional[str], str]:
        """Fetch a page over HTTP, optionally falling back to a browser.

        HTTP errors and request exceptions are retried up to
        ``max_retries`` times with a growing pause; a block page stops the
        retries immediately.  On failure ``self.last_error`` is set.

        Returns:
            ``(html, title, final_url)`` on success, otherwise
            ``(None, None, requested_url)``.
        """
        full_url = self._build_url(url, params)
        last_error = ""
        for attempt in range(1, self.max_retries + 1):
            try:
                response = self.session.get(url, params=params, timeout=self.timeout)
                html = self._decode_response(response)
                soup = BeautifulSoup(html, "html.parser")
                title = soup.title.get_text(strip=True) if soup.title else ""
                blocked, signal = self._is_blocked_response(html, title)
                if blocked:
                    # Retrying a block page only makes the block worse.
                    last_error = f"site_blocked:{signal}"
                    self._record_diagnostic(
                        "blocked_response",
                        request_url=response.url,
                        status_code=response.status_code,
                        page_title=title,
                        signal=signal,
                    )
                    break
                if response.status_code < 400:
                    return html, title, response.url
                last_error = f"http_status:{response.status_code}"
                self._record_diagnostic(
                    "http_error",
                    request_url=response.url,
                    status_code=response.status_code,
                    page_title=title,
                )
            except requests.RequestException as exc:
                last_error = str(exc)
                self._record_diagnostic(
                    "request_exception",
                    request_url=full_url,
                    error=str(exc),
                    attempt=attempt,
                )
            if attempt < self.max_retries:
                self._sleep(min(2.0 * attempt, 5.0))

        if allow_browser_fallback and self.use_browser_fallback:
            browser_html, browser_title, browser_url, browser_error = (
                self._request_html_by_browser(full_url)
            )
            if browser_html:
                return browser_html, browser_title, browser_url
            if browser_error:
                last_error = (
                    f"{last_error}; browser:{browser_error}" if last_error else browser_error
                )
        self.last_error = last_error
        return None, None, full_url

    def _request_html_by_browser(self, url: str) -> Tuple[Optional[str], Optional[str], str, str]:
        """Fetch a page with a Selenium-driven Edge browser.

        Returns:
            ``(html, title, final_url, "")`` on success, otherwise
            ``(None, None, url, error)``.
        """
        try:
            from selenium import webdriver
            from selenium.webdriver.common.by import By
            from selenium.webdriver.edge.options import Options as EdgeOptions
            from selenium.webdriver.support import expected_conditions as EC
            from selenium.webdriver.support.ui import WebDriverWait
        except Exception as exc:
            # Deliberately broad: the browser path is best-effort and any
            # import-time failure just disables it.
            error = f"selenium_unavailable:{exc}"
            self._record_diagnostic("browser_unavailable", request_url=url, error=error)
            return None, None, url, error

        driver = None
        try:
            options = EdgeOptions()
            if self.browser_headless:
                options.add_argument("--headless=new")
            options.add_argument("--disable-gpu")
            options.add_argument("--window-size=1600,1200")
            options.add_argument("--disable-blink-features=AutomationControlled")
            options.add_argument("--no-sandbox")
            options.add_argument(f"user-agent={self.USER_AGENT}")
            driver = webdriver.Edge(options=options)
            driver.set_page_load_timeout(self.timeout + 10)
            driver.get(url)
            WebDriverWait(driver, self.timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            # Fixed pause before reading the page source.
            time.sleep(2)
            html = driver.page_source
            title = driver.title or ""
            current_url = driver.current_url
            blocked, signal = self._is_blocked_response(html, title)
            if blocked:
                error = f"site_blocked:{signal}"
                self._record_diagnostic(
                    "browser_blocked_response",
                    request_url=current_url,
                    page_title=title,
                    signal=signal,
                )
                return None, None, current_url, error
            self._record_diagnostic(
                "browser_fallback_success", request_url=current_url, page_title=title
            )
            return html, title, current_url, ""
        except Exception as exc:
            error = str(exc)
            self._record_diagnostic("browser_exception", request_url=url, error=error)
            return None, None, url, error
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception as exc:
                    # Still best-effort, but leave a trace instead of
                    # swallowing the failure silently.
                    self._record_diagnostic(
                        "browser_quit_failed", request_url=url, error=str(exc)
                    )

    # ------------------------------------------------------------------
    # URL and value normalisation
    # ------------------------------------------------------------------
    def _build_url(self, url: str, params: Optional[Dict]) -> str:
        """Return ``url`` with ``params`` appended as a query string."""
        if not params:
            return url
        return f"{url}?{urlencode(params, doseq=True)}"

    def _build_search_url(self, params: Dict) -> str:
        """Build the search URL with every value percent-encoded as UTF-8.

        ``None`` values are sent as empty strings; keys are emitted as-is.
        """
        encoded_parts = []
        for key, value in params.items():
            text = "" if value is None else str(value)
            encoded_value = quote(text, safe="", encoding="utf-8", errors="strict")
            encoded_parts.append(f"{key}={encoded_value}")
        return f"{self.SEARCH_URL}?{'&'.join(encoded_parts)}"

    def _normalize_date(self, raw_date: str) -> str:
        """Return the first date in ``raw_date`` as ``YYYY-MM-DD``.

        The stripped input is returned unchanged when no date is found.
        """
        raw_date = raw_date.strip()
        match = re.search(r"(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})", raw_date)
        if not match:
            return raw_date
        year, month, day = match.groups()
        return f"{int(year):04d}-{int(month):02d}-{int(day):02d}"

    def _normalize_datetime(self, raw_text: str) -> str:
        """Return the first date (and ``H:MM`` time, if any) in ``raw_text``.

        The date is rendered as ``YYYY-MM-DD``; a following time is appended
        verbatim after a space.  The stripped input is returned unchanged
        when no date is found.
        """
        raw_text = raw_text.strip()
        match = re.search(
            r"(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})(?:日)?\s*(\d{1,2}:\d{2})?", raw_text
        )
        if not match:
            return raw_text
        year, month, day, time_part = match.groups()
        date_part = f"{int(year):04d}-{int(month):02d}-{int(day):02d}"
        return f"{date_part} {time_part}" if time_part else date_part

    @staticmethod
    def _build_std_timestamp(date_text: str) -> int:
        """Return the Unix timestamp of 08:00:00 (UTC+8) on ``date_text``.

        ``date_text`` must be ``YYYY-MM-DD``; anything else, including an
        empty string, yields ``0``.
        """
        if not date_text:
            return 0
        try:
            beijing_tz = timezone(timedelta(hours=8))
            dt = datetime.strptime(date_text, "%Y-%m-%d").replace(
                hour=8,
                minute=0,
                second=0,
                microsecond=0,
                tzinfo=beijing_tz,
            )
            return int(dt.timestamp())
        except ValueError:
            return 0

    # ------------------------------------------------------------------
    # Search-result parsing
    # ------------------------------------------------------------------
    def _extract_total(self, soup: "BeautifulSoup") -> int:
        """Return the result count announced on a search page, or ``0``.

        The explicit "found N" / "total N" wordings are preferred; a bare
        "N 条" is only used when neither is present, so an unrelated count
        earlier on the page is not mistaken for the total.
        """
        text = soup.get_text(" ", strip=True)
        patterns = (
            r"找到\s*(\d+)\s*条",
            r"共\s*(\d+)\s*条",
            r"(\d+)\s*条",
        )
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return int(match.group(1))
        return 0

    @staticmethod
    def _looks_like_result(node) -> bool:
        """Return True when ``node`` contains a plausible announcement link."""
        link = node.find("a", href=True)
        if not link:
            return False
        if len(link.get_text(" ", strip=True)) < 4:
            return False
        href = link["href"].strip()
        return "ccgp.gov.cn" in href or href.startswith("/")

    def _extract_result_nodes(self, soup: "BeautifulSoup") -> List:
        """Return the result ``<li>`` nodes of a search page.

        Selectors are tried from most to least specific; the first one that
        yields at least one plausible result wins.
        """
        selectors = (
            "ul.vT-srch-result-list-bid > li",
            ".vT-srch-result-list-bid li",
            ".vT-srch-result-list li",
            ".search-result li",
            "li",
        )
        for selector in selectors:
            candidates = [
                node for node in soup.select(selector) if self._looks_like_result(node)
            ]
            if candidates:
                return candidates
        return []

    @staticmethod
    def _extract_region(node_text: str, span_texts) -> str:
        """Pull the region out of a result's text.

        Args:
            node_text: Whitespace-normalised text of the whole result node.
            span_texts: Iterable of the texts of the node's ``span``/``em``
                children; only consumed when ``node_text`` has no match.

        Returns:
            The region, or ``""`` when none is found.  Both the ASCII colon
            and the full-width colon (U+FF1A) are accepted as separators.
        """
        match = re.search(r"(?:地区|地域|行政区划)[:\uff1a]?\s*([^\s|]+)", node_text)
        if match:
            return match.group(1)
        for span_text in span_texts:
            if "地区" in span_text or "地域" in span_text:
                return re.split(r"[:\uff1a]", span_text)[-1].strip()
        return ""

    @classmethod
    def _detect_project_type(cls, node_text: str) -> str:
        """Return the first ``KNOWN_PROJECT_TYPES`` entry found, or ``""``."""
        for known_type in cls.KNOWN_PROJECT_TYPES:
            if known_type in node_text:
                return known_type
        return ""

    def _build_item(
        self,
        *,
        title: str,
        url: str,
        date: str,
        project_type: str,
        region: str,
        keyword: str,
    ) -> Dict:
        """Assemble one result record; ``content`` is filled in later."""
        return {
            "title": title,
            "url": url,
            "date": date,
            "std_timestamp": self._build_std_timestamp(date),
            "project_type": project_type,
            "region": region,
            "keyword": keyword,
            "source": self.SOURCE_NAME,
            "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "content": "",
        }

    def _parse_search_items(self, soup: "BeautifulSoup", keyword: str) -> List[Dict]:
        """Turn a parsed search page into a list of result records."""
        items: List[Dict] = []
        for node in self._extract_result_nodes(soup):
            link = node.find("a", href=True)
            if not link:
                continue
            title = re.sub(r"\s+", " ", link.get_text(" ", strip=True))
            href = urljoin(self.SEARCH_URL, link["href"].strip())
            node_text = re.sub(r"\s+", " ", node.get_text(" ", strip=True))
            date_match = re.search(r"\d{4}[-/.年]\d{1,2}[-/.月]\d{1,2}", node_text)
            date = self._normalize_date(date_match.group(0)) if date_match else ""
            # Generator: child texts are only extracted if the regex on the
            # full node text finds nothing.
            region = self._extract_region(
                node_text,
                (tag.get_text(" ", strip=True) for tag in node.find_all(["span", "em"])),
            )
            items.append(
                self._build_item(
                    title=title,
                    url=href,
                    date=date,
                    project_type=self._detect_project_type(node_text),
                    region=region,
                    keyword=keyword,
                )
            )
        return items

    # ------------------------------------------------------------------
    # Listing-page fallback
    # ------------------------------------------------------------------
    def _build_listing_page_url(self, base_url: str, page_index: int) -> str:
        """Return the URL of listing page ``page_index`` (1-based).

        Page 1 is ``base_url`` itself; page ``n`` is ``index_{n-1}.htm``.
        """
        if page_index <= 1:
            return base_url
        return urljoin(base_url, f"index_{page_index - 1}.htm")

    def _extract_listing_page_count(self, html: str) -> int:
        """Read the page count from the listing's ``Pager({...})`` call.

        Returns ``1`` when the call is absent; the result is never below 1.
        """
        match = re.search(
            r"Pager\(\{size:(\d+),\s*current:\d+,\s*prefix:'index',suffix:'htm'\}\)", html
        )
        if not match:
            return 1
        return max(int(match.group(1)), 1)

    def _parse_listing_items(
        self, html: str, page_url: str, keyword: str, project_type: str
    ) -> List[Dict]:
        """Parse one listing page into result records.

        The first ``<em>`` of each entry is read as the publish time and the
        second as the region.
        """
        soup = BeautifulSoup(html, "html.parser")
        items: List[Dict] = []
        for node in soup.select("ul.c_list_bid > li"):
            link = node.find("a", href=True)
            if not link:
                continue
            title = re.sub(r"\s+", " ", link.get_text(" ", strip=True))
            href = urljoin(page_url, link["href"].strip())
            meta_values = [em.get_text(" ", strip=True) for em in node.find_all("em")]
            publish_time = self._normalize_datetime(meta_values[0]) if len(meta_values) > 0 else ""
            region = meta_values[1] if len(meta_values) > 1 else ""
            date = publish_time.split(" ")[0] if publish_time else ""
            items.append(
                self._build_item(
                    title=title,
                    url=href,
                    date=date,
                    project_type=project_type,
                    region=region,
                    keyword=keyword,
                )
            )
        return items

    def _scan_listing_pages_for_keyword(
        self,
        keyword: str,
        max_items: int = 20,
        max_pages_per_category: int = 3,
    ) -> List[Dict]:
        """Find announcements mentioning ``keyword`` via the listing pages.

        Listing entries whose title contains the keyword, followed by those
        whose title contains a ``TRANSPORT_HINTS`` fragment, have their
        detail page fetched; an entry is kept when the keyword occurs in its
        title or body.

        Args:
            keyword: Keyword to look for (surrounding whitespace ignored).
            max_items: Stop after this many matches.
            max_pages_per_category: Pages scanned per listing source.

        Returns:
            Matching records with ``content`` filled in.
        """
        keyword = keyword.strip()
        title_candidates: List[Dict] = []
        related_candidates: List[Dict] = []
        seen_urls = set()

        for source in self.LISTING_SOURCES:
            total_pages = 1
            for page_index in range(1, max_pages_per_category + 1):
                page_url = self._build_listing_page_url(source["url"], page_index)
                html, _, final_url = self._request_html(
                    page_url,
                    params=None,
                    allow_browser_fallback=False,
                )
                if not html:
                    continue
                if page_index == 1:
                    total_pages = self._extract_listing_page_count(html)
                items = self._parse_listing_items(
                    html=html,
                    page_url=final_url or page_url,
                    keyword=keyword,
                    project_type=source["project_type"],
                )
                for item in items:
                    if item["url"] in seen_urls:
                        continue
                    seen_urls.add(item["url"])
                    title = item["title"]
                    if keyword in title:
                        title_candidates.append(item)
                    elif any(hint in title for hint in self.TRANSPORT_HINTS):
                        related_candidates.append(item)
                if page_index >= total_pages:
                    break
                self._sleep(0.4)

        results: List[Dict] = []
        for item in title_candidates + related_candidates:
            content = self.fetch_detail_content(item["url"])
            item["content"] = content
            if keyword in f"{item['title']}\n{content}":
                results.append(item)
                if len(results) >= max_items:
                    break
            # Throttle after every detail fetch, matching or not.
            self._sleep(0.4)

        self._record_diagnostic(
            "listing_fallback_scan",
            keyword=keyword,
            title_candidates=len(title_candidates),
            related_candidates=len(related_candidates),
            matched=len(results),
            max_pages_per_category=max_pages_per_category,
        )
        return results

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    @staticmethod
    def _make_search_result(
        keyword: str,
        page_index: int,
        request_url: str,
        *,
        items: Optional[List[Dict]] = None,
        total: int = 0,
        success: bool = True,
        blocked: bool = False,
        message: str = "",
        fallback_mode: Optional[str] = None,
    ) -> Dict:
        """Build the dict returned by ``search_by_keyword``.

        ``fallback_mode`` is only included when given.
        """
        result = {
            "keyword": keyword,
            "page_index": page_index,
            "total": total,
            "items": [] if items is None else items,
            "success": success,
            "blocked": blocked,
            "message": message,
            "request_url": request_url,
        }
        if fallback_mode is not None:
            result["fallback_mode"] = fallback_mode
        return result

    def search_by_keyword(self, keyword: str, page_index: int = 1, bid_type: int = 0) -> Dict:
        """Run one page of a keyword search.

        When the search endpoint is blocked, page 1 is answered from the
        listing-page fallback and later pages return an empty successful
        result so callers stop paging.

        Returns:
            A dict with ``keyword``, ``page_index``, ``total``, ``items``,
            ``success``, ``blocked``, ``message`` and ``request_url``, plus
            ``fallback_mode`` when the fallback path was taken.
        """
        params = {
            "searchtype": 1,
            "page_index": page_index,
            "bidSort": 0,
            "buyerName": "",
            "projectId": "",
            "pinMu": 0,
            "bidType": bid_type,
            "dbselect": "bidx",
            "kw": keyword,
            "start_time": "",
            "end_time": "",
            "timeType": 6,
            "displayZone": "",
            "zoneId": "",
            "agentName": "",
        }
        search_url = self._build_search_url(params)
        html, title, request_url = self._request_html(search_url, params=None)

        if not html:
            # Read before the fallback scan issues further requests that may
            # overwrite ``last_error``.
            blocked = "site_blocked" in self.last_error
            if blocked and page_index == 1:
                fallback_items = self._scan_listing_pages_for_keyword(
                    keyword, max_items=self.PAGE_SIZE
                )
                return self._make_search_result(
                    keyword,
                    page_index,
                    request_url,
                    items=fallback_items,
                    total=len(fallback_items),
                    message=(
                        "search_endpoint_blocked_used_listing_fallback"
                        if fallback_items
                        else "search_endpoint_blocked_listing_fallback_no_match"
                    ),
                    fallback_mode="listing_scan",
                )
            if blocked and page_index > 1:
                return self._make_search_result(
                    keyword,
                    page_index,
                    request_url,
                    message="search_endpoint_blocked_stop_paging",
                    fallback_mode="listing_scan",
                )
            return self._make_search_result(
                keyword,
                page_index,
                request_url,
                success=False,
                blocked=blocked,
                message=self.last_error or "request_failed",
            )

        soup = BeautifulSoup(html, "html.parser")
        items = self._parse_search_items(soup, keyword)
        total = self._extract_total(soup)
        if not items:
            self._record_diagnostic(
                "empty_search_parse",
                keyword=keyword,
                page_index=page_index,
                request_url=request_url,
                page_title=title or "",
            )
        return self._make_search_result(
            keyword, page_index, request_url, items=items, total=total
        )

    def fetch_detail_content(self, url: str) -> str:
        """Return the plain-text body of an announcement page.

        The first known content container holding more than 100 characters
        of text is used, with ``<body>`` as the last candidate.  Returns
        ``""`` when the page cannot be fetched or no container qualifies.
        """
        html, _, request_url = self._request_html(url, params=None)
        if not html:
            self._record_diagnostic(
                "detail_fetch_failed", request_url=request_url, error=self.last_error
            )
            return ""

        soup = BeautifulSoup(html, "html.parser")
        selectors = (
            "div.vF_detail_content_container",
            "div.vF_detail_content",
            "div#zoom",
            "div.content",
            "div.article",
            "div.article-content",
            "div.contxt",
            "body",
        )
        content_node = None
        for selector in selectors:
            node = soup.select_one(selector)
            if node and len(node.get_text(" ", strip=True)) > 100:
                content_node = node
                break
        if content_node is None:
            self._record_diagnostic("detail_parse_empty", request_url=request_url)
            return ""

        for tag in content_node.find_all(["script", "style", "noscript"]):
            tag.decompose()
        text = content_node.get_text("\n", strip=True)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()

    def crawl_by_keywords(self, keywords: List[str], max_per_keyword: int = 30) -> List[Dict]:
        """Search each keyword and fetch the detail page of every new hit.

        URLs are de-duplicated across keywords.  Paging for a keyword stops
        when ``max_per_keyword`` records were collected, a search fails, a
        page is empty, or a page holds fewer than ``PAGE_SIZE`` results.

        Returns:
            All collected records, in crawl order.
        """
        all_results: List[Dict] = []
        seen_urls = set()
        for keyword in keywords:
            print("\n" + "=" * 60)
            print(f"正在搜索关键词: {keyword}")
            print("=" * 60)
            keyword_results: List[Dict] = []
            page_index = 1
            blocked = False
            while len(keyword_results) < max_per_keyword:
                print(f"[进度] 正在获取第 {page_index} 页...")
                search_result = self.search_by_keyword(keyword, page_index=page_index)
                if not search_result["success"]:
                    print(f"[失败] 搜索失败: {search_result['message']}")
                    blocked = search_result.get("blocked", False)
                    break
                items = search_result["items"]
                if not items:
                    print("[结束] 当前页未解析到结果")
                    break
                print(
                    f"[成功] 本页解析 {len(items)} 条结果"
                    f"(页面总数提示: {search_result['total']})"
                )
                for item in items:
                    if item["url"] in seen_urls:
                        continue
                    print(f" [抓取] {item['title'][:60]}")
                    item["content"] = self.fetch_detail_content(item["url"])
                    seen_urls.add(item["url"])
                    keyword_results.append(item)
                    all_results.append(item)
                    if len(keyword_results) >= max_per_keyword:
                        break
                    self._sleep()
                if len(items) < self.PAGE_SIZE:
                    # A short page is taken to be the last one.
                    break
                page_index += 1
                self._sleep(max(1.5, self.request_delay))
            print(f"[完成] 关键词 '{keyword}' 共获取 {len(keyword_results)} 条结果")
            if blocked:
                print("[提示] 当前环境可能触发了政府采购网的频率限制,已记录诊断信息")
        return all_results

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------
    def save_results(self, results: List[Dict], output_dir: str = "./data") -> str:
        """Write ``results`` to a timestamped JSON file and return its path."""
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(output_dir, f"ccgp_results_{timestamp}.json")
        with open(output_path, "w", encoding="utf-8") as file:
            json.dump(results, file, ensure_ascii=False, indent=2)
        print(f"[保存] 结果已保存到: {output_path}")
        return output_path

    def save_diagnostics(self, output_dir: str = "./data") -> Optional[str]:
        """Write the collected diagnostics to a timestamped JSON file.

        Returns:
            The file path, or ``None`` when there are no diagnostics.
        """
        if not self.diagnostics:
            return None
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(output_dir, f"ccgp_probe_{timestamp}.json")
        payload = {
            "source": self.SOURCE_NAME,
            "test_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "success": False,
            "items": [],
            "diagnosis": self.diagnostics,
        }
        with open(output_path, "w", encoding="utf-8") as file:
            json.dump(payload, file, ensure_ascii=False, indent=2)
        print(f"[保存] 诊断信息已保存到: {output_path}")
        return output_path

    def save_to_vectordb(self, results: List[Dict], vector_db_dir: str = "./vector_db") -> None:
        """Upsert every record that has body text into the RAG vector store.

        Records whose ``content`` is missing, ``None`` or blank are skipped.
        """
        if not results:
            print("[RAG] 无可入库的数据")
            return
        rag_items = []
        for item in results:
            # ``or ""`` also covers an explicit ``None`` content value.
            content = (item.get("content") or "").strip()
            if not content:
                continue
            rag_items.append(
                {
                    "title": item.get("title", ""),
                    "content": f"标题: {item.get('title', '')}\n\n{content}",
                    "url": item.get("url", ""),
                    "source": item.get("source", self.SOURCE_NAME),
                    "std_timestamp": item.get("std_timestamp", 0),
                }
            )
        if not rag_items:
            print("[RAG] 抓取结果没有可用正文,跳过向量库写入")
            return
        processor = RAGProcessor(vector_db_dir=vector_db_dir)
        processor.process_news(rag_items, upsert=True)
        stats = processor.get_database_stats()
        print(
            f"[RAG] 向量数据库现有 {stats['unique_news']} 条新闻,"
            f"{stats['total_documents']} 个文档片段"
        )

    def crawl_and_save(
        self,
        keywords: List[str],
        max_per_keyword: int = 30,
        output_dir: str = "./data",
        save_to_rag: bool = False,
        vector_db_dir: str = "./vector_db",
    ) -> List[Dict]:
        """Crawl ``keywords`` and persist the outcome.

        Results are saved as JSON (and optionally pushed to the vector
        store); when nothing was collected the diagnostics are saved
        instead.

        Returns:
            The collected records.
        """
        results = self.crawl_by_keywords(keywords=keywords, max_per_keyword=max_per_keyword)
        if results:
            self.save_results(results, output_dir=output_dir)
            if save_to_rag:
                self.save_to_vectordb(results, vector_db_dir=vector_db_dir)
        else:
            self.save_diagnostics(output_dir=output_dir)
        return results
if __name__ == "__main__":
    # Small demo run: two traffic-signal keywords, JSON output only.
    demo_keywords = ["信控", "绿波"]
    fetched = CCGPCrawler().crawl_and_save(
        keywords=demo_keywords,
        max_per_keyword=10,
        output_dir="./data",
        save_to_rag=False,
    )
    print(f"\n爬取完成,共获取 {len(fetched)} 条结果")