"""中国政府采购网爬虫。""" import json import os import re import time from datetime import datetime, timedelta, timezone from typing import Dict, List, Optional, Tuple from urllib.parse import quote, urlencode, urljoin import requests from bs4 import BeautifulSoup from rag import RAGProcessor class CCGPCrawler: """中国政府采购网关键词爬虫。""" SEARCH_URL = "https://search.ccgp.gov.cn/bxsearch" SOURCE_NAME = "中国政府采购网" LISTING_SOURCES = [ {"url": "https://www.ccgp.gov.cn/cggg/dfgg/gkzb/", "project_type": "公开招标公告"}, {"url": "https://www.ccgp.gov.cn/cggg/dfgg/zbgg/", "project_type": "中标公告"}, {"url": "https://www.ccgp.gov.cn/cggg/dfgg/cjgg/", "project_type": "成交公告"}, {"url": "https://www.ccgp.gov.cn/cggg/dfgg/jzxcs/", "project_type": "竞争性磋商公告"}, {"url": "https://www.ccgp.gov.cn/cggg/dfgg/jzxtpgg/", "project_type": "竞争性谈判公告"}, {"url": "https://www.ccgp.gov.cn/cggg/dfgg/xjgg/", "project_type": "询价公告"}, {"url": "https://www.ccgp.gov.cn/cggg/zygg/gkzb/", "project_type": "公开招标公告"}, {"url": "https://www.ccgp.gov.cn/cggg/zygg/zbgg/", "project_type": "中标公告"}, {"url": "https://www.ccgp.gov.cn/cggg/zygg/cjgg/", "project_type": "成交公告"}, {"url": "https://www.ccgp.gov.cn/cggg/zygg/jzxcs/", "project_type": "竞争性磋商公告"}, {"url": "https://www.ccgp.gov.cn/cggg/zygg/jzxtpgg/", "project_type": "竞争性谈判公告"}, {"url": "https://www.ccgp.gov.cn/cggg/zygg/xjgg/", "project_type": "询价公告"}, ] def __init__( self, timeout: int = 15, request_delay: float = 1.0, max_retries: int = 3, use_browser_fallback: bool = True, browser_headless: bool = True, ): self.timeout = timeout self.request_delay = request_delay self.max_retries = max_retries self.use_browser_fallback = use_browser_fallback self.browser_headless = browser_headless self.session = requests.Session() self.session.headers.update( { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0" ), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Cache-Control": "no-cache", "Pragma": "no-cache", "Connection": "keep-alive", } ) self.diagnostics: List[Dict] = [] self.last_error: str = "" def _record_diagnostic(self, event: str, **payload) -> None: diagnostic = { "event": event, "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), } diagnostic.update(payload) self.diagnostics.append(diagnostic) def _sleep(self, seconds: Optional[float] = None) -> None: time.sleep(self.request_delay if seconds is None else seconds) def _is_blocked_response(self, html: str, title: str = "") -> Tuple[bool, str]: signals = [ "频繁访问", "访问过于频繁", "操作过于频繁", "禁止访问", "访问异常", "安全验证", "403 Forbidden", "抱歉,您的请求被阻断了", "You are unable to access", "Internal Server Error", ] check_text = f"{title}\n{html[:3000]}" for signal in signals: if signal in check_text: return True, signal return False, "" def _decode_response(self, response: requests.Response) -> str: encodings = [] for encoding in ("utf-8", response.apparent_encoding, response.encoding, "gb18030"): if encoding and encoding not in encodings: encodings.append(encoding) for encoding in encodings: try: return response.content.decode(encoding) except UnicodeDecodeError: continue return response.text def _request_html( self, url: str, params: Optional[Dict] = None, allow_browser_fallback: bool = True, ) -> Tuple[Optional[str], Optional[str], str]: last_error = "" for attempt in range(1, self.max_retries + 1): try: response = self.session.get(url, params=params, timeout=self.timeout) html = self._decode_response(response) soup = BeautifulSoup(html, "html.parser") title = soup.title.get_text(strip=True) if soup.title else "" blocked, signal = self._is_blocked_response(html, title) if blocked: last_error = f"site_blocked:{signal}" self._record_diagnostic( "blocked_response", request_url=response.url, status_code=response.status_code, page_title=title, signal=signal, ) break if response.status_code >= 400: last_error = f"http_status:{response.status_code}" self._record_diagnostic( "http_error", request_url=response.url, status_code=response.status_code, page_title=title, ) else: return html, title, response.url except requests.RequestException as exc: last_error = str(exc) self._record_diagnostic( "request_exception", request_url=self._build_url(url, params), error=str(exc), attempt=attempt, ) if attempt < self.max_retries: self._sleep(min(2.0 * attempt, 5.0)) if allow_browser_fallback and self.use_browser_fallback: browser_html, browser_title, browser_url, browser_error = self._request_html_by_browser( self._build_url(url, params) ) if browser_html: return browser_html, browser_title, browser_url if browser_error: last_error = f"{last_error}; browser:{browser_error}" if last_error else browser_error self.last_error = last_error return None, None, self._build_url(url, params) def _request_html_by_browser(self, url: str) -> Tuple[Optional[str], Optional[str], str, str]: try: from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.edge.options import Options as EdgeOptions from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait except Exception as exc: error = f"selenium_unavailable:{exc}" self._record_diagnostic("browser_unavailable", request_url=url, error=error) return None, None, url, error driver = None try: options = EdgeOptions() if self.browser_headless: options.add_argument("--headless=new") options.add_argument("--disable-gpu") options.add_argument("--window-size=1600,1200") options.add_argument("--disable-blink-features=AutomationControlled") options.add_argument("--no-sandbox") options.add_argument( "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0" ) driver = webdriver.Edge(options=options) driver.set_page_load_timeout(self.timeout + 10) driver.get(url) WebDriverWait(driver, self.timeout).until( EC.presence_of_element_located((By.TAG_NAME, "body")) ) time.sleep(2) html = driver.page_source title = driver.title or "" current_url = driver.current_url blocked, signal = self._is_blocked_response(html, title) if blocked: error = f"site_blocked:{signal}" self._record_diagnostic( "browser_blocked_response", request_url=current_url, page_title=title, signal=signal, ) return None, None, current_url, error self._record_diagnostic("browser_fallback_success", request_url=current_url, page_title=title) return html, title, current_url, "" except Exception as exc: error = str(exc) self._record_diagnostic("browser_exception", request_url=url, error=error) return None, None, url, error finally: if driver is not None: try: driver.quit() except Exception: pass def _build_url(self, url: str, params: Optional[Dict]) -> str: if not params: return url return f"{url}?{urlencode(params, doseq=True)}" def _build_search_url(self, params: Dict) -> str: """Build search URL with explicit UTF-8 encoding for the keyword.""" encoded_parts = [] for key, value in params.items(): if value is None: value = "" if isinstance(value, str): encoded_value = quote(value, safe="", encoding="utf-8", errors="strict") else: encoded_value = quote(str(value), safe="", encoding="utf-8", errors="strict") encoded_parts.append(f"{key}={encoded_value}") return f"{self.SEARCH_URL}?{'&'.join(encoded_parts)}" def _normalize_date(self, raw_date: str) -> str: raw_date = raw_date.strip() match = re.search(r"(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})", raw_date) if not match: return raw_date year, month, day = match.groups() return f"{int(year):04d}-{int(month):02d}-{int(day):02d}" def _normalize_datetime(self, raw_text: str) -> str: raw_text = raw_text.strip() match = re.search(r"(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})(?:日)?\s*(\d{1,2}:\d{2})?", raw_text) if not match: return raw_text year, month, day, time_part = match.groups() date_part = f"{int(year):04d}-{int(month):02d}-{int(day):02d}" return f"{date_part} {time_part}" if time_part else date_part @staticmethod def _build_std_timestamp(date_text: str) -> int: """Build a Unix timestamp for 08:00:00 Asia/Shanghai on the given date.""" if not date_text: return 0 try: beijing_tz = timezone(timedelta(hours=8)) dt = datetime.strptime(date_text, "%Y-%m-%d").replace( hour=8, minute=0, second=0, microsecond=0, tzinfo=beijing_tz, ) return int(dt.timestamp()) except ValueError: return 0 def _extract_total(self, soup: BeautifulSoup) -> int: text = soup.get_text(" ", strip=True) patterns = [ r"共\s*(\d+)\s*条", r"约\s*(\d+)\s*条", r"找到\s*(\d+)\s*条", ] for pattern in patterns: match = re.search(pattern, text) if match: return int(match.group(1)) return 0 def _extract_result_nodes(self, soup: BeautifulSoup) -> List: selectors = [ "ul.vT-srch-result-list-bid > li", ".vT-srch-result-list-bid li", ".vT-srch-result-list li", ".search-result li", "li", ] for selector in selectors: nodes = soup.select(selector) candidate_nodes = [] for node in nodes: link = node.find("a", href=True) if not link: continue href = link["href"].strip() title = link.get_text(" ", strip=True) if len(title) < 4: continue if "ccgp.gov.cn" not in href and not href.startswith("/"): continue candidate_nodes.append(node) if candidate_nodes: return candidate_nodes return [] def _parse_search_items(self, soup: BeautifulSoup, keyword: str) -> List[Dict]: items: List[Dict] = [] for node in self._extract_result_nodes(soup): link = node.find("a", href=True) if not link: continue title = re.sub(r"\s+", " ", link.get_text(" ", strip=True)) href = urljoin(self.SEARCH_URL, link["href"].strip()) node_text = re.sub(r"\s+", " ", node.get_text(" ", strip=True)) date_match = re.search(r"\d{4}[-/.年]\d{1,2}[-/.月]\d{1,2}", node_text) date = self._normalize_date(date_match.group(0)) if date_match else "" std_timestamp = self._build_std_timestamp(date) region = "" region_match = re.search(r"(?:地区|地域|行政区划)[::]?\s*([^\s|]+)", node_text) if region_match: region = region_match.group(1) else: span_texts = [span.get_text(" ", strip=True) for span in node.find_all(["span", "em"])] for span_text in span_texts: if "地区" in span_text or "地域" in span_text: region = span_text.split(":")[-1].split(":")[-1].strip() break project_type = "" known_types = [ "公开招标公告", "竞争性磋商公告", "竞争性谈判公告", "询价公告", "成交公告", "中标公告", "更正公告", "废标公告", "单一来源公告", "采购公告", "招标公告", ] for known_type in known_types: if known_type in node_text: project_type = known_type break items.append( { "title": title, "url": href, "date": date, "std_timestamp": std_timestamp, "project_type": project_type, "region": region, "keyword": keyword, "source": self.SOURCE_NAME, "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "content": "", } ) return items def _build_listing_page_url(self, base_url: str, page_index: int) -> str: if page_index <= 1: return base_url return urljoin(base_url, f"index_{page_index - 1}.htm") def _extract_listing_page_count(self, html: str) -> int: match = re.search(r"Pager\(\{size:(\d+),\s*current:\d+,\s*prefix:'index',suffix:'htm'\}\)", html) if not match: return 1 return max(int(match.group(1)), 1) def _parse_listing_items(self, html: str, page_url: str, keyword: str, project_type: str) -> List[Dict]: soup = BeautifulSoup(html, "html.parser") items: List[Dict] = [] for node in soup.select("ul.c_list_bid > li"): link = node.find("a", href=True) if not link: continue title = re.sub(r"\s+", " ", link.get_text(" ", strip=True)) href = urljoin(page_url, link["href"].strip()) meta_values = [em.get_text(" ", strip=True) for em in node.find_all("em")] publish_time = self._normalize_datetime(meta_values[0]) if len(meta_values) > 0 else "" region = meta_values[1] if len(meta_values) > 1 else "" date = publish_time.split(" ")[0] if publish_time else "" std_timestamp = self._build_std_timestamp(date) items.append( { "title": title, "url": href, "date": date, "std_timestamp": std_timestamp, "project_type": project_type, "region": region, "keyword": keyword, "source": self.SOURCE_NAME, "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "content": "", } ) return items def _scan_listing_pages_for_keyword( self, keyword: str, max_items: int = 20, max_pages_per_category: int = 3, ) -> List[Dict]: keyword = keyword.strip() title_candidates: List[Dict] = [] related_candidates: List[Dict] = [] seen_urls = set() transport_hints = [ keyword, "交通", "信号", "道路", "路口", "绿波", "信控", "交警", "公交", "轨道", "导航", "地图", "信号灯", "红绿灯", "诱导", ] for source in self.LISTING_SOURCES: total_pages = 1 for page_index in range(1, max_pages_per_category + 1): page_url = self._build_listing_page_url(source["url"], page_index) html, _, final_url = self._request_html( page_url, params=None, allow_browser_fallback=False, ) if not html: continue if page_index == 1: total_pages = self._extract_listing_page_count(html) items = self._parse_listing_items( html=html, page_url=final_url or page_url, keyword=keyword, project_type=source["project_type"], ) for item in items: if item["url"] in seen_urls: continue seen_urls.add(item["url"]) title = item["title"] if keyword in title: title_candidates.append(item) elif any(hint and hint in title for hint in transport_hints): related_candidates.append(item) if page_index >= total_pages: break self._sleep(0.4) results: List[Dict] = [] fetch_queue = title_candidates + related_candidates for item in fetch_queue: content = self.fetch_detail_content(item["url"]) item["content"] = content haystack = f"{item['title']}\n{content}" if keyword not in haystack: continue results.append(item) if len(results) >= max_items: break self._sleep(0.4) self._record_diagnostic( "listing_fallback_scan", keyword=keyword, title_candidates=len(title_candidates), related_candidates=len(related_candidates), matched=len(results), max_pages_per_category=max_pages_per_category, ) return results def search_by_keyword(self, keyword: str, page_index: int = 1, bid_type: int = 0) -> Dict: params = { "searchtype": 1, "page_index": page_index, "bidSort": 0, "buyerName": "", "projectId": "", "pinMu": 0, "bidType": bid_type, "dbselect": "bidx", "kw": keyword, "start_time": "", "end_time": "", "timeType": 6, "displayZone": "", "zoneId": "", "agentName": "", } search_url = self._build_search_url(params) html, title, request_url = self._request_html(search_url, params=None) if not html: if "site_blocked" in self.last_error and page_index == 1: fallback_items = self._scan_listing_pages_for_keyword(keyword, max_items=20) return { "keyword": keyword, "page_index": page_index, "total": len(fallback_items), "items": fallback_items, "success": True, "blocked": False, "message": ( "search_endpoint_blocked_used_listing_fallback" if fallback_items else "search_endpoint_blocked_listing_fallback_no_match" ), "request_url": request_url, "fallback_mode": "listing_scan", } if "site_blocked" in self.last_error and page_index > 1: return { "keyword": keyword, "page_index": page_index, "total": 0, "items": [], "success": True, "blocked": False, "message": "search_endpoint_blocked_stop_paging", "request_url": request_url, "fallback_mode": "listing_scan", } return { "keyword": keyword, "page_index": page_index, "total": 0, "items": [], "success": False, "blocked": "site_blocked" in self.last_error, "message": self.last_error or "request_failed", "request_url": request_url, } soup = BeautifulSoup(html, "html.parser") items = self._parse_search_items(soup, keyword) total = self._extract_total(soup) if not items: self._record_diagnostic( "empty_search_parse", keyword=keyword, page_index=page_index, request_url=request_url, page_title=title or "", ) return { "keyword": keyword, "page_index": page_index, "total": total, "items": items, "success": True, "blocked": False, "message": "", "request_url": request_url, } def fetch_detail_content(self, url: str) -> str: html, _, request_url = self._request_html(url, params=None) if not html: self._record_diagnostic("detail_fetch_failed", request_url=request_url, error=self.last_error) return "" soup = BeautifulSoup(html, "html.parser") selectors = [ "div.vF_detail_content_container", "div.vF_detail_content", "div#zoom", "div.content", "div.article", "div.article-content", "div.contxt", "body", ] content_node = None for selector in selectors: node = soup.select_one(selector) if node and len(node.get_text(" ", strip=True)) > 100: content_node = node break if content_node is None: self._record_diagnostic("detail_parse_empty", request_url=request_url) return "" for tag in content_node.find_all(["script", "style", "noscript"]): tag.decompose() text = content_node.get_text("\n", strip=True) text = re.sub(r"\n{3,}", "\n\n", text) return text.strip() def crawl_by_keywords(self, keywords: List[str], max_per_keyword: int = 30) -> List[Dict]: all_results: List[Dict] = [] seen_urls = set() for keyword in keywords: print("\n" + "=" * 60) print(f"正在搜索关键词: {keyword}") print("=" * 60) keyword_results: List[Dict] = [] page_index = 1 blocked = False while len(keyword_results) < max_per_keyword: print(f"[进度] 正在获取第 {page_index} 页...") search_result = self.search_by_keyword(keyword, page_index=page_index) if not search_result["success"]: print(f"[失败] 搜索失败: {search_result['message']}") blocked = search_result.get("blocked", False) break items = search_result["items"] if not items: print("[结束] 当前页未解析到结果") break print(f"[成功] 本页解析 {len(items)} 条结果(页面总数提示: {search_result['total']})") for item in items: if item["url"] in seen_urls: continue print(f" [抓取] {item['title'][:60]}") item["content"] = self.fetch_detail_content(item["url"]) seen_urls.add(item["url"]) keyword_results.append(item) all_results.append(item) if len(keyword_results) >= max_per_keyword: break self._sleep() if len(items) < 20: break page_index += 1 self._sleep(max(1.5, self.request_delay)) print(f"[完成] 关键词 '{keyword}' 共获取 {len(keyword_results)} 条结果") if blocked: print("[提示] 当前环境可能触发了政府采购网的频率限制,已记录诊断信息") return all_results def save_results(self, results: List[Dict], output_dir: str = "./data") -> str: os.makedirs(output_dir, exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_path = os.path.join(output_dir, f"ccgp_results_{timestamp}.json") with open(output_path, "w", encoding="utf-8") as file: json.dump(results, file, ensure_ascii=False, indent=2) print(f"[保存] 结果已保存到: {output_path}") return output_path def save_diagnostics(self, output_dir: str = "./data") -> Optional[str]: if not self.diagnostics: return None os.makedirs(output_dir, exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_path = os.path.join(output_dir, f"ccgp_probe_{timestamp}.json") payload = { "source": self.SOURCE_NAME, "test_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "success": False, "items": [], "diagnosis": self.diagnostics, } with open(output_path, "w", encoding="utf-8") as file: json.dump(payload, file, ensure_ascii=False, indent=2) print(f"[保存] 诊断信息已保存到: {output_path}") return output_path def save_to_vectordb(self, results: List[Dict], vector_db_dir: str = "./vector_db") -> None: if not results: print("[RAG] 无可入库的数据") return rag_items = [] for item in results: content = item.get("content", "").strip() if not content: continue rag_items.append( { "title": item.get("title", ""), "content": f"标题: {item.get('title', '')}\n\n{content}", "url": item.get("url", ""), "source": item.get("source", self.SOURCE_NAME), "std_timestamp": item.get("std_timestamp", 0), } ) if not rag_items: print("[RAG] 抓取结果没有可用正文,跳过向量库写入") return processor = RAGProcessor(vector_db_dir=vector_db_dir) processor.process_news(rag_items, upsert=True) stats = processor.get_database_stats() print( f"[RAG] 向量数据库现有 {stats['unique_news']} 条新闻," f"{stats['total_documents']} 个文档片段" ) def crawl_and_save( self, keywords: List[str], max_per_keyword: int = 30, output_dir: str = "./data", save_to_rag: bool = False, vector_db_dir: str = "./vector_db", ) -> List[Dict]: results = self.crawl_by_keywords(keywords=keywords, max_per_keyword=max_per_keyword) if results: self.save_results(results, output_dir=output_dir) if save_to_rag: self.save_to_vectordb(results, vector_db_dir=vector_db_dir) else: self.save_diagnostics(output_dir=output_dir) return results if __name__ == "__main__": crawler = CCGPCrawler() items = crawler.crawl_and_save( keywords=["信控", "绿波"], max_per_keyword=10, output_dir="./data", save_to_rag=False, ) print(f"\n爬取完成,共获取 {len(items)} 条结果")