""" 政府采购网爬虫集成示例 演示如何爬取政府采购信息并生成分析报表 """ import os from dotenv import load_dotenv from crawler.ccgp_crawler import CCGPCrawler from rag import RAGProcessor from report import ReportGenerator def main(): """主函数""" load_dotenv() # 获取配置 qwen_api_key = os.getenv("QWEN_API_KEY") qwen_model = os.getenv("QWEN_MODEL", "qwen-max") data_dir = os.getenv("DATA_DIR", "./data") vector_db_dir = os.getenv("VECTOR_DB_DIR", "./vector_db") print("="*60) print("政府采购信息分析系统 - 信控/绿波") print("="*60) # 步骤1: 爬取政府采购信息 print("\n[步骤 1/3] 爬取政府采购网信息...") ccgp_crawler = CCGPCrawler() # 搜索关键词 keywords = ["信控", "绿波"] results = ccgp_crawler.crawl_by_keywords( keywords=keywords, max_per_keyword=20 # 每个关键词最多20条 ) if not results: print("未能获取到采购信息") return print(f"[成功] 爬取了 {len(results)} 条采购信息") # 步骤2: 保存到向量数据库 print("\n[步骤 2/3] 将采购信息存入向量数据库...") rag_processor = RAGProcessor(vector_db_dir=vector_db_dir) # 转换格式并组合标题+内容 news_list = [] for item in results: if not item.get('content'): continue # 标题和内容一起嵌入向量数据库 combined_content = f"标题: {item['title']}\n\n{item['content']}" news_item = { "title": item["title"], "content": combined_content, "url": item["url"], "source": item["source"], "std_timestamp": item.get("std_timestamp", 0), } news_list.append(news_item) # 使用upsert模式存储 rag_processor.process_news(news_list, upsert=True) db_stats = rag_processor.get_database_stats() print(f"[成功] 向量数据库现有 {db_stats['unique_news']} 条新闻") # 步骤3: 生成分析报表 print("\n[步骤 3/3] 生成采购信息分析报表...") report_generator = ReportGenerator(api_key=qwen_api_key, model_name=qwen_model) # 检索相关内容增强报表 relevant_docs = rag_processor.search("信控 绿波 交通信号 采购", k=15) report = report_generator.generate_summary_report(news_list, relevant_docs) # 保存报表 report_path = report_generator.save_report( report, report_type='ccgp_xinlv_lvbo', output_dir=data_dir ) print(f"[成功] 报表已生成: {report_path}") # 打印报表预览 print("\n" + "="*60) print("报表内容预览:") print("="*60) print(report) # 统计信息 print("\n" + "="*60) print("统计信息:") print("="*60) print(f"总采购信息数: {len(results)}") # 按关键词统计 keyword_stats = {} for item in results: kw = item.get('keyword', '未知') keyword_stats[kw] = keyword_stats.get(kw, 0) + 1 print("\n按关键词统计:") for kw, count in keyword_stats.items(): print(f" - {kw}: {count} 条") # 按项目类型统计 type_stats = {} for item in results: ptype = item.get('project_type', '未知') if ptype: type_stats[ptype] = type_stats.get(ptype, 0) + 1 if type_stats: print("\n按项目类型统计:") for ptype, count in type_stats.items(): print(f" - {ptype}: {count} 条") print("\n" + "="*60) print("任务完成!") print("="*60) if __name__ == "__main__": main()