103 lines
3.0 KiB
Python
103 lines
3.0 KiB
Python
|
|
"""
|
||
|
|
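WeChat official-account crawler integration example.

Demonstrates how to combine WeChat article crawling, vector-database storage,
and report generation into a single workflow.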
|
||
|
|
"""
|
||
|
|
import os
|
||
|
|
from dotenv import load_dotenv
|
||
|
|
from crawler.wechat_crawler import WeChatCrawler
|
||
|
|
from rag import RAGProcessor
|
||
|
|
from report import ReportGenerator
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Run the WeChat article pipeline: crawl, index, then report.

    Settings are read from the environment after ``load_dotenv()``:
    ``WECHAT_APPID`` and ``WECHAT_APPSECRET`` are required, while
    ``QWEN_API_KEY``, ``QWEN_MODEL`` (default ``qwen-max``), ``DATA_DIR``
    (default ``./data``) and ``VECTOR_DB_DIR`` (default ``./vector_db``)
    are handed to the collaborating objects unchanged.

    Progress messages and the generated report are printed to stdout.
    The function prints a message and returns early when the WeChat
    credentials are missing or when the crawl yields no articles.
    """
    load_dotenv()

    # Gather every setting up front so the rest of the flow is linear.
    read_env = os.getenv
    wechat_id = read_env("WECHAT_APPID")
    wechat_secret = read_env("WECHAT_APPSECRET")
    llm_key = read_env("QWEN_API_KEY")
    llm_model = read_env("QWEN_MODEL", "qwen-max")
    output_root = read_env("DATA_DIR", "./data")
    vector_store_path = read_env("VECTOR_DB_DIR", "./vector_db")

    # Both WeChat credentials must be present before anything else runs.
    if not (wechat_id and wechat_secret):
        print("错误: 请在.env文件中配置WECHAT_APPID和WECHAT_APPSECRET")
        return

    rule = "=" * 60
    print(rule)
    print("微信公众号文章分析系统 - 高德地图")
    print(rule)

    # Step 1: crawl WeChat articles.
    print("\n[步骤 1/3] 爬取微信公众号文章...")
    crawler = WeChatCrawler(appid=wechat_id, appsecret=wechat_secret)
    # Per the original note, the keyword may be changed or left empty to
    # fetch all articles.
    fetched = crawler.crawl_articles(max_count=30, keyword="交通")

    if not fetched:
        print("未能获取到文章")
        return

    print(f"[成功] 爬取了 {len(fetched)} 篇文章")

    # Step 2: store the articles in the vector database.
    print("\n[步骤 2/3] 将文章存入向量数据库...")
    store = RAGProcessor(vector_db_dir=vector_store_path)

    # Reshape each article; the title is prepended to the body so that both
    # are embedded together.
    news_list = [
        {
            "title": entry["title"],
            "content": f"标题: {entry['title']}\n\n{entry['content']}",
            "url": entry["url"],
            "source": entry["source"],
            "std_timestamp": entry.get("std_timestamp", 0),
        }
        for entry in fetched
    ]

    # Store in upsert mode.
    store.process_news(news_list, upsert=True)

    stored_count = store.get_database_stats()["unique_news"]
    print(f"[成功] 向量数据库现有 {stored_count} 条新闻")

    # Step 3: generate the analysis report.
    print("\n[步骤 3/3] 生成分析报表...")
    reporter = ReportGenerator(api_key=llm_key, model_name=llm_model)

    # Retrieve related documents to enrich the report.
    context_docs = store.search("交通 导航 路况", k=10)
    report_text = reporter.generate_summary_report(news_list, context_docs)

    # Persist the report under the configured data directory.
    saved_to = reporter.save_report(
        report_text,
        report_type='wechat_gaode',
        output_dir=output_root,
    )
    print(f"[成功] 报表已生成: {saved_to}")

    # Print a preview of the report.
    print("\n" + rule)
    print("报表内容预览:")
    print(rule)
    print(report_text)

    print("\n" + rule)
    print("任务完成!")
    print(rule)
|
||
|
|
|
||
|
|
|
||
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|