# newsreport_agent_for_traffic/crawler/ccgp_crawler.py

"""Crawler for the China Government Procurement website (ccgp.gov.cn)."""
import json
import os
import re
import time
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote, urlencode, urljoin

import requests
from bs4 import BeautifulSoup

from rag import RAGProcessor


class CCGPCrawler:
    """Keyword crawler for the China Government Procurement website (ccgp.gov.cn)."""

    # Keyword search endpoint.
    SEARCH_URL = "https://search.ccgp.gov.cn/bxsearch"
    # Human-readable site name (kept in Chinese because it is runtime data).
    SOURCE_NAME = "中国政府采购网"
    # Announcement listing pages; each entry pairs a listing URL with the
    # announcement type (``project_type``) associated with that listing.
    # NOTE(review): the "dfgg" / "zygg" path segments presumably distinguish
    # local from central announcements — confirm against the site.
    LISTING_SOURCES = [
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/gkzb/", "project_type": "公开招标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/zbgg/", "project_type": "中标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/cjgg/", "project_type": "成交公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/jzxcs/", "project_type": "竞争性磋商公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/jzxtpgg/", "project_type": "竞争性谈判公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/dfgg/xjgg/", "project_type": "询价公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/gkzb/", "project_type": "公开招标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/zbgg/", "project_type": "中标公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/cjgg/", "project_type": "成交公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/jzxcs/", "project_type": "竞争性磋商公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/jzxtpgg/", "project_type": "竞争性谈判公告"},
        {"url": "https://www.ccgp.gov.cn/cggg/zygg/xjgg/", "project_type": "询价公告"},
    ]

    def __init__(
        self,
        timeout: int = 15,
        request_delay: float = 1.0,
        max_retries: int = 3,
        use_browser_fallback: bool = True,
        browser_headless: bool = True,
    ):
        self.timeout = timeout
        self.request_delay = request_delay
        self.max_retries = max_retries
        self.use_browser_fallback = use_browser_fallback
        self.browser_headless = browser_headless
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
                ),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Cache-Control": "no-cache",
                "Pragma": "no-cache",
                "Connection": "keep-alive",
            }
        )
        self.diagnostics: List[Dict] = []
        self.last_error: str = ""

    def _record_diagnostic(self, event: str, **payload) -> None:
        diagnostic = {
            "event": event,
            "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
        diagnostic.update(payload)
        self.diagnostics.append(diagnostic)

    def _sleep(self, seconds: Optional[float] = None) -> None:
        time.sleep(self.request_delay if seconds is None else seconds)

    def _is_blocked_response(self, html: str, title: str = "") -> Tuple[bool, str]:
        signals = [
            "频繁访问",
            "访问过于频繁",
            "操作过于频繁",
            "禁止访问",
            "访问异常",
            "安全验证",
            "403 Forbidden",
            "抱歉，您的请求被阻断了",
            "You are unable to access",
            "Internal Server Error",
        ]
        check_text = f"{title}\n{html[:3000]}"
        for signal in signals:
            if signal in check_text:
                return True, signal
        return False, ""

    def _decode_response(self, response: requests.Response) -> str:
        encodings = []
        for encoding in ("utf-8", response.apparent_encoding, response.encoding, "gb18030"):
            if encoding and encoding not in encodings:
                encodings.append(encoding)

        for encoding in encodings:
            try:
                return response.content.decode(encoding)
            except UnicodeDecodeError:
                continue

        return response.text

    def _request_html(
        self,
        url: str,
        params: Optional[Dict] = None,
        allow_browser_fallback: bool = True,
    ) -> Tuple[Optional[str], Optional[str], str]:
        last_error = ""
        for attempt in range(1, self.max_retries + 1):
            try:
                response = self.session.get(url, params=params, timeout=self.timeout)
                html = self._decode_response(response)
                soup = BeautifulSoup(html, "html.parser")
                title = soup.title.get_text(strip=True) if soup.title else ""

                blocked, signal = self._is_blocked_response(html, title)
                if blocked:
                    last_error = f"site_blocked:{signal}"
                    self._record_diagnostic(
                        "blocked_response",
                        request_url=response.url,
                        status_code=response.status_code,
                        page_title=title,
                        signal=signal,
                    )
                    break

                if response.status_code >= 400:
                    last_error = f"http_status:{response.status_code}"
                    self._record_diagnostic(
                        "http_error",
                        request_url=response.url,
                        status_code=response.status_code,
                        page_title=title,
                    )
                else:
                    return html, title, response.url
            except requests.RequestException as exc:
                last_error = str(exc)
                self._record_diagnostic(
                    "request_exception",
                    request_url=self._build_url(url, params),
                    error=str(exc),
                    attempt=attempt,
                )

            if attempt < self.max_retries:
                self._sleep(min(2.0 * attempt, 5.0))

        if allow_browser_fallback and self.use_browser_fallback:
            browser_html, browser_title, browser_url, browser_error = self._request_html_by_browser(
                self._build_url(url, params)
            )
            if browser_html:
                return browser_html, browser_title, browser_url
            if browser_error:
                last_error = f"{last_error}; browser:{browser_error}" if last_error else browser_error

        self.last_error = last_error
        return None, None, self._build_url(url, params)

    def _request_html_by_browser(self, url: str) -> Tuple[Optional[str], Optional[str], str, str]:
        try:
            from selenium import webdriver
            from selenium.webdriver.common.by import By
            from selenium.webdriver.edge.options import Options as EdgeOptions
            from selenium.webdriver.support import expected_conditions as EC
            from selenium.webdriver.support.ui import WebDriverWait
        except Exception as exc:
            error = f"selenium_unavailable:{exc}"
            self._record_diagnostic("browser_unavailable", request_url=url, error=error)
            return None, None, url, error

        driver = None
        try:
            options = EdgeOptions()
            if self.browser_headless:
                options.add_argument("--headless=new")
            options.add_argument("--disable-gpu")
            options.add_argument("--window-size=1600,1200")
            options.add_argument("--disable-blink-features=AutomationControlled")
            options.add_argument("--no-sandbox")
            options.add_argument(
                "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
            )

            driver = webdriver.Edge(options=options)
            driver.set_page_load_timeout(self.timeout + 10)
            driver.get(url)
            WebDriverWait(driver, self.timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            time.sleep(2)
            html = driver.page_source
            title = driver.title or ""
            current_url = driver.current_url

            blocked, signal = self._is_blocked_response(html, title)
            if blocked:
                error = f"site_blocked:{signal}"
                self._record_diagnostic(
                    "browser_blocked_response",
                    request_url=current_url,
                    page_title=title,
                    signal=signal,
                )
                return None, None, current_url, error

            self._record_diagnostic("browser_fallback_success", request_url=current_url, page_title=title)
            return html, title, current_url, ""
        except Exception as exc:
            error = str(exc)
            self._record_diagnostic("browser_exception", request_url=url, error=error)
            return None, None, url, error
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    pass

    def _build_url(self, url: str, params: Optional[Dict]) -> str:
        if not params:
            return url
        return f"{url}?{urlencode(params, doseq=True)}"

    def _build_search_url(self, params: Dict) -> str:
        """Build search URL with explicit UTF-8 encoding for the keyword."""
        encoded_parts = []
        for key, value in params.items():
            if value is None:
                value = ""
            if isinstance(value, str):
                encoded_value = quote(value, safe="", encoding="utf-8", errors="strict")
            else:
                encoded_value = quote(str(value), safe="", encoding="utf-8", errors="strict")
            encoded_parts.append(f"{key}={encoded_value}")
        return f"{self.SEARCH_URL}?{'&'.join(encoded_parts)}"

    def _normalize_date(self, raw_date: str) -> str:
        raw_date = raw_date.strip()
        match = re.search(r"(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})", raw_date)
        if not match:
            return raw_date
        year, month, day = match.groups()
        return f"{int(year):04d}-{int(month):02d}-{int(day):02d}"

    def _normalize_datetime(self, raw_text: str) -> str:
        raw_text = raw_text.strip()
        match = re.search(r"(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})(?:日)?\s*(\d{1,2}:\d{2})?", raw_text)
        if not match:
            return raw_text
        year, month, day, time_part = match.groups()
        date_part = f"{int(year):04d}-{int(month):02d}-{int(day):02d}"
        return f"{date_part} {time_part}" if time_part else date_part

    @staticmethod
    def _build_std_timestamp(date_text: str) -> int:
        """Build a Unix timestamp for 08:00:00 Asia/Shanghai on the given date."""
        if not date_text:
            return 0
        try:
            beijing_tz = timezone(timedelta(hours=8))
            dt = datetime.strptime(date_text, "%Y-%m-%d").replace(
                hour=8,
                minute=0,
                second=0,
                microsecond=0,
                tzinfo=beijing_tz,
            )
            return int(dt.timestamp())
        except ValueError:
            return 0

    def _extract_total(self, soup: BeautifulSoup) -> int:
        text = soup.get_text(" ", strip=True)
        patterns = [
            r"共\s*(\d+)\s*条",
            r"约\s*(\d+)\s*条",
            r"找到\s*(\d+)\s*条",
        ]
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return int(match.group(1))
        return 0

    def _extract_result_nodes(self, soup: BeautifulSoup) -> List:
        selectors = [
            "ul.vT-srch-result-list-bid > li",
            ".vT-srch-result-list-bid li",
            ".vT-srch-result-list li",
            ".search-result li",
            "li",
        ]
        for selector in selectors:
            nodes = soup.select(selector)
            candidate_nodes = []
            for node in nodes:
                link = node.find("a", href=True)
                if not link:
                    continue
                href = link["href"].strip()
                title = link.get_text(" ", strip=True)
                if len(title) < 4:
                    continue
                if "ccgp.gov.cn" not in href and not href.startswith("/"):
                    continue
                candidate_nodes.append(node)
            if candidate_nodes:
                return candidate_nodes
        return []

    def _parse_search_items(self, soup: BeautifulSoup, keyword: str) -> List[Dict]:
        items: List[Dict] = []
        for node in self._extract_result_nodes(soup):
            link = node.find("a", href=True)
            if not link:
                continue

            title = re.sub(r"\s+", " ", link.get_text(" ", strip=True))
            href = urljoin(self.SEARCH_URL, link["href"].strip())
            node_text = re.sub(r"\s+", " ", node.get_text(" ", strip=True))

            date_match = re.search(r"\d{4}[-/.年]\d{1,2}[-/.月]\d{1,2}", node_text)
            date = self._normalize_date(date_match.group(0)) if date_match else ""
            std_timestamp = self._build_std_timestamp(date)

            region = ""
            region_match = re.search(r"(?:地区|地域|行政区划)[:：]?\s*([^\s|]+)", node_text)
            if region_match:
                region = region_match.group(1)
            else:
                span_texts = [span.get_text(" ", strip=True) for span in node.find_all(["span", "em"])]
                for span_text in span_texts:
                    if "地区" in span_text or "地域" in span_text:
                        region = span_text.split(":")[-1].split("：")[-1].strip()
                        break

            project_type = ""
            known_types = [
                "公开招标公告",
                "竞争性磋商公告",
                "竞争性谈判公告",
                "询价公告",
                "成交公告",
                "中标公告",
                "更正公告",
                "废标公告",
                "单一来源公告",
                "采购公告",
                "招标公告",
            ]
            for known_type in known_types:
                if known_type in node_text:
                    project_type = known_type
                    break

            items.append(
                {
                    "title": title,
                    "url": href,
                    "date": date,
                    "std_timestamp": std_timestamp,
                    "project_type": project_type,
                    "region": region,
                    "keyword": keyword,
                    "source": self.SOURCE_NAME,
                    "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "content": "",
                }
            )
        return items

    def _build_listing_page_url(self, base_url: str, page_index: int) -> str:
        if page_index <= 1:
            return base_url
        return urljoin(base_url, f"index_{page_index - 1}.htm")

    def _extract_listing_page_count(self, html: str) -> int:
        match = re.search(r"Pager\(\{size:(\d+),\s*current:\d+,\s*prefix:'index',suffix:'htm'\}\)", html)
        if not match:
            return 1
        return max(int(match.group(1)), 1)

    def _parse_listing_items(self, html: str, page_url: str, keyword: str, project_type: str) -> List[Dict]:
        soup = BeautifulSoup(html, "html.parser")
        items: List[Dict] = []
        for node in soup.select("ul.c_list_bid > li"):
            link = node.find("a", href=True)
            if not link:
                continue

            title = re.sub(r"\s+", " ", link.get_text(" ", strip=True))
            href = urljoin(page_url, link["href"].strip())
            meta_values = [em.get_text(" ", strip=True) for em in node.find_all("em")]
            publish_time = self._normalize_datetime(meta_values[0]) if len(meta_values) > 0 else ""
            region = meta_values[1] if len(meta_values) > 1 else ""
            date = publish_time.split(" ")[0] if publish_time else ""
            std_timestamp = self._build_std_timestamp(date)

            items.append(
                {
                    "title": title,
                    "url": href,
                    "date": date,
                    "std_timestamp": std_timestamp,
                    "project_type": project_type,
                    "region": region,
                    "keyword": keyword,
                    "source": self.SOURCE_NAME,
                    "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "content": "",
                }
            )
        return items

    def _scan_listing_pages_for_keyword(
        self,
        keyword: str,
        max_items: int = 20,
        max_pages_per_category: int = 3,
    ) -> List[Dict]:
        keyword = keyword.strip()
        title_candidates: List[Dict] = []
        related_candidates: List[Dict] = []
        seen_urls = set()
        transport_hints = [
            keyword,
            "交通",
            "信号",
            "道路",
            "路口",
            "绿波",
            "信控",
            "交警",
            "公交",
            "轨道",
            "导航",
            "地图",
            "信号灯",
            "红绿灯",
            "诱导",
        ]

        for source in self.LISTING_SOURCES:
            total_pages = 1
            for page_index in range(1, max_pages_per_category + 1):
                page_url = self._build_listing_page_url(source["url"], page_index)
                html, _, final_url = self._request_html(
                    page_url,
                    params=None,
                    allow_browser_fallback=False,
                )
                if not html:
                    continue

                if page_index == 1:
                    total_pages = self._extract_listing_page_count(html)

                items = self._parse_listing_items(
                    html=html,
                    page_url=final_url or page_url,
                    keyword=keyword,
                    project_type=source["project_type"],
                )
                for item in items:
                    if item["url"] in seen_urls:
                        continue
                    seen_urls.add(item["url"])

                    title = item["title"]
                    if keyword in title:
                        title_candidates.append(item)
                    elif any(hint and hint in title for hint in transport_hints):
                        related_candidates.append(item)

                if page_index >= total_pages:
                    break
                self._sleep(0.4)

        results: List[Dict] = []
        fetch_queue = title_candidates + related_candidates
        for item in fetch_queue:
            content = self.fetch_detail_content(item["url"])
            item["content"] = content
            haystack = f"{item['title']}\n{content}"
            if keyword not in haystack:
                continue
            results.append(item)
            if len(results) >= max_items:
                break
            self._sleep(0.4)

        self._record_diagnostic(
            "listing_fallback_scan",
            keyword=keyword,
            title_candidates=len(title_candidates),
            related_candidates=len(related_candidates),
            matched=len(results),
            max_pages_per_category=max_pages_per_category,
        )
        return results

    def search_by_keyword(self, keyword: str, page_index: int = 1, bid_type: int = 0) -> Dict:
        params = {
            "searchtype": 1,
            "page_index": page_index,
            "bidSort": 0,
            "buyerName": "",
            "projectId": "",
            "pinMu": 0,
            "bidType": bid_type,
            "dbselect": "bidx",
            "kw": keyword,
            "start_time": "",
            "end_time": "",
            "timeType": 6,
            "displayZone": "",
            "zoneId": "",
            "agentName": "",
        }

        search_url = self._build_search_url(params)
        html, title, request_url = self._request_html(search_url, params=None)
        if not html:
            if "site_blocked" in self.last_error and page_index == 1:
                fallback_items = self._scan_listing_pages_for_keyword(keyword, max_items=20)
                return {
                    "keyword": keyword,
                    "page_index": page_index,
                    "total": len(fallback_items),
                    "items": fallback_items,
                    "success": True,
                    "blocked": False,
                    "message": (
                        "search_endpoint_blocked_used_listing_fallback"
                        if fallback_items
                        else "search_endpoint_blocked_listing_fallback_no_match"
                    ),
                    "request_url": request_url,
                    "fallback_mode": "listing_scan",
                }
            if "site_blocked" in self.last_error and page_index > 1:
                return {
                    "keyword": keyword,
                    "page_index": page_index,
                    "total": 0,
                    "items": [],
                    "success": True,
                    "blocked": False,
                    "message": "search_endpoint_blocked_stop_paging",
                    "request_url": request_url,
                    "fallback_mode": "listing_scan",
                }
            return {
                "keyword": keyword,
                "page_index": page_index,
                "total": 0,
                "items": [],
                "success": False,
                "blocked": "site_blocked" in self.last_error,
                "message": self.last_error or "request_failed",
                "request_url": request_url,
            }

        soup = BeautifulSoup(html, "html.parser")
        items = self._parse_search_items(soup, keyword)
        total = self._extract_total(soup)

        if not items:
            self._record_diagnostic(
                "empty_search_parse",
                keyword=keyword,
                page_index=page_index,
                request_url=request_url,
                page_title=title or "",
            )

        return {
            "keyword": keyword,
            "page_index": page_index,
            "total": total,
            "items": items,
            "success": True,
            "blocked": False,
            "message": "",
            "request_url": request_url,
        }

    def fetch_detail_content(self, url: str) -> str:
        html, _, request_url = self._request_html(url, params=None)
        if not html:
            self._record_diagnostic("detail_fetch_failed", request_url=request_url, error=self.last_error)
            return ""

        soup = BeautifulSoup(html, "html.parser")
        selectors = [
            "div.vF_detail_content_container",
            "div.vF_detail_content",
            "div#zoom",
            "div.content",
            "div.article",
            "div.article-content",
            "div.contxt",
            "body",
        ]

        content_node = None
        for selector in selectors:
            node = soup.select_one(selector)
            if node and len(node.get_text(" ", strip=True)) > 100:
                content_node = node
                break

        if content_node is None:
            self._record_diagnostic("detail_parse_empty", request_url=request_url)
            return ""

        for tag in content_node.find_all(["script", "style", "noscript"]):
            tag.decompose()

        text = content_node.get_text("\n", strip=True)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()

    def crawl_by_keywords(self, keywords: List[str], max_per_keyword: int = 30) -> List[Dict]:
        all_results: List[Dict] = []
        seen_urls = set()

        for keyword in keywords:
            print("\n" + "=" * 60)
            print(f"正在搜索关键词: {keyword}")
            print("=" * 60)

            keyword_results: List[Dict] = []
            page_index = 1
            blocked = False

            while len(keyword_results) < max_per_keyword:
                print(f"[进度] 正在获取第 {page_index} 页...")
                search_result = self.search_by_keyword(keyword, page_index=page_index)

                if not search_result["success"]:
                    print(f"[失败] 搜索失败: {search_result['message']}")
                    blocked = search_result.get("blocked", False)
                    break

                items = search_result["items"]
                if not items:
                    print("[结束] 当前页未解析到结果")
                    break

                print(f"[成功] 本页解析 {len(items)} 条结果（页面总数提示: {search_result['total']}）")

                for item in items:
                    if item["url"] in seen_urls:
                        continue

                    print(f"  [抓取] {item['title'][:60]}")
                    item["content"] = self.fetch_detail_content(item["url"])
                    seen_urls.add(item["url"])
                    keyword_results.append(item)
                    all_results.append(item)

                    if len(keyword_results) >= max_per_keyword:
                        break
                    self._sleep()

                if len(items) < 20:
                    break

                page_index += 1
                self._sleep(max(1.5, self.request_delay))

            print(f"[完成] 关键词 '{keyword}' 共获取 {len(keyword_results)} 条结果")
            if blocked:
                print("[提示] 当前环境可能触发了政府采购网的频率限制，已记录诊断信息")

        return all_results

    def save_results(self, results: List[Dict], output_dir: str = "./data") -> str:
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(output_dir, f"ccgp_results_{timestamp}.json")
        with open(output_path, "w", encoding="utf-8") as file:
            json.dump(results, file, ensure_ascii=False, indent=2)
        print(f"[保存] 结果已保存到: {output_path}")
        return output_path

    def save_diagnostics(self, output_dir: str = "./data") -> Optional[str]:
        if not self.diagnostics:
            return None
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(output_dir, f"ccgp_probe_{timestamp}.json")
        payload = {
            "source": self.SOURCE_NAME,
            "test_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "success": False,
            "items": [],
            "diagnosis": self.diagnostics,
        }
        with open(output_path, "w", encoding="utf-8") as file:
            json.dump(payload, file, ensure_ascii=False, indent=2)
        print(f"[保存] 诊断信息已保存到: {output_path}")
        return output_path

    def save_to_vectordb(self, results: List[Dict], vector_db_dir: str = "./vector_db") -> None:
        if not results:
            print("[RAG] 无可入库的数据")
            return

        rag_items = []
        for item in results:
            content = item.get("content", "").strip()
            if not content:
                continue
            rag_items.append(
                {
                    "title": item.get("title", ""),
                    "content": f"标题: {item.get('title', '')}\n\n{content}",
                    "url": item.get("url", ""),
                    "source": item.get("source", self.SOURCE_NAME),
                    "std_timestamp": item.get("std_timestamp", 0),
                }
            )

        if not rag_items:
            print("[RAG] 抓取结果没有可用正文，跳过向量库写入")
            return

        processor = RAGProcessor(vector_db_dir=vector_db_dir)
        processor.process_news(rag_items, upsert=True)
        stats = processor.get_database_stats()
        print(
            f"[RAG] 向量数据库现有 {stats['unique_news']} 条新闻，"
            f"{stats['total_documents']} 个文档片段"
        )

    def crawl_and_save(
        self,
        keywords: List[str],
        max_per_keyword: int = 30,
        output_dir: str = "./data",
        save_to_rag: bool = False,
        vector_db_dir: str = "./vector_db",
    ) -> List[Dict]:
        results = self.crawl_by_keywords(keywords=keywords, max_per_keyword=max_per_keyword)

        if results:
            self.save_results(results, output_dir=output_dir)
            if save_to_rag:
                self.save_to_vectordb(results, vector_db_dir=vector_db_dir)
        else:
            self.save_diagnostics(output_dir=output_dir)

        return results


if __name__ == "__main__":
    # Demo run: crawl two sample keywords, keep at most ten items each, and
    # write the JSON output to ./data without touching the vector database.
    demo_keywords = ["信控", "绿波"]
    crawler = CCGPCrawler()
    items = crawler.crawl_and_save(
        demo_keywords,
        max_per_keyword=10,
        output_dir="./data",
        save_to_rag=False,
    )
    total_items = len(items)
    print(f"\n爬取完成，共获取 {total_items} 条结果")