"""
WeChat official-account crawler integration example.

Demonstrates how to combine WeChat article crawling, vector-database storage
and report generation into one pipeline.
"""

import os

from dotenv import load_dotenv

from crawler.wechat_crawler import WeChatCrawler
from rag import RAGProcessor
from report import ReportGenerator


def _to_news_item(article):
    """Build the record passed to the RAG processor from one crawled article.

    The title is prepended to the article body so that both are embedded
    together in the vector database.
    """
    return {
        "title": article["title"],
        "content": f"标题: {article['title']}\n\n{article['content']}",
        "url": article["url"],
        "source": article["source"],
        # Articles without a standardized timestamp fall back to 0.
        "std_timestamp": article.get("std_timestamp", 0),
    }


def main():
    """Run the three-step pipeline: crawl articles, store them, build a report.

    Configuration is read from environment variables after ``load_dotenv()``:
    ``WECHAT_APPID`` and ``WECHAT_APPSECRET`` are required (an error is
    printed and the function returns if either is missing or empty), while
    ``QWEN_API_KEY``, ``QWEN_MODEL`` (default ``"qwen-max"``), ``DATA_DIR``
    (default ``"./data"``) and ``VECTOR_DB_DIR`` (default ``"./vector_db"``)
    are passed on to the report generator and RAG processor.

    Returns:
        None. Progress and the generated report are printed to stdout.
    """
    load_dotenv()

    # Read configuration.
    wechat_appid = os.getenv("WECHAT_APPID")
    wechat_appsecret = os.getenv("WECHAT_APPSECRET")
    api_key = os.getenv("QWEN_API_KEY")
    model_name = os.getenv("QWEN_MODEL", "qwen-max")
    output_dir = os.getenv("DATA_DIR", "./data")
    db_dir = os.getenv("VECTOR_DB_DIR", "./vector_db")

    if not (wechat_appid and wechat_appsecret):
        print("错误: 请在.env文件中配置WECHAT_APPID和WECHAT_APPSECRET")
        return

    separator = "=" * 60
    print(separator)
    print("微信公众号文章分析系统 - 高德地图")
    print(separator)

    # Step 1: crawl WeChat articles.
    print("\n[步骤 1/3] 爬取微信公众号文章...")
    crawler = WeChatCrawler(appid=wechat_appid, appsecret=wechat_appsecret)
    articles = crawler.crawl_articles(
        max_count=30,
        keyword="交通",  # change the keyword, or leave it empty to fetch all articles
    )

    if not articles:
        print("未能获取到文章")
        return

    print(f"[成功] 爬取了 {len(articles)} 篇文章")

    # Step 2: save the articles into the vector database.
    print("\n[步骤 2/3] 将文章存入向量数据库...")
    rag = RAGProcessor(vector_db_dir=db_dir)

    # Convert each article to the news format (title + content combined).
    news_list = [_to_news_item(item) for item in articles]

    # Store in upsert mode.
    rag.process_news(news_list, upsert=True)

    stats = rag.get_database_stats()
    print(f"[成功] 向量数据库现有 {stats['unique_news']} 条新闻")

    # Step 3: generate the analysis report.
    print("\n[步骤 3/3] 生成分析报表...")
    reporter = ReportGenerator(api_key=api_key, model_name=model_name)

    # Retrieve related documents and pass them to the report generator.
    related_docs = rag.search("交通 导航 路况", k=10)

    report = reporter.generate_summary_report(news_list, related_docs)

    # Save the report.
    saved_path = reporter.save_report(
        report,
        report_type="wechat_gaode",
        output_dir=output_dir,
    )

    print(f"[成功] 报表已生成: {saved_path}")

    # Print a preview of the report.
    print("\n" + separator)
    print("报表内容预览:")
    print(separator)
    print(report)

    print("\n" + separator)
    print("任务完成!")
    print(separator)


if __name__ == "__main__":
    main()