# newsreport_agent_for_traffic/crawler/news_crawler.py
# 2026-05-09 10:46:52 +08:00
"""
赛文交通网新闻爬虫模块
"""
import json
import os
import time
from datetime import datetime, timedelta, timezone
from typing import Dict, List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class TrafficNewsCrawler:
    """Crawler for the traffic news site configured by ``base_url``.

    The crawler downloads one fixed listing page, keeps the anchors that look
    like article links, fetches the body of each article and writes the
    collected items to a timestamped JSON file.
    """

    def __init__(self, base_url: str = "https://www.7its.com/"):
        """Store the site root and the HTTP headers sent with every request.

        Args:
            base_url: Root URL of the site. A trailing slash is optional.
        """
        self.base_url = base_url
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/91.0.4472.124 Safari/537.36"
            )
        }

    @staticmethod
    def _build_std_timestamp(date_text: str) -> int:
        """Build a Unix timestamp for 08:00:00 Asia/Shanghai on the given date.

        Args:
            date_text: Date formatted as ``YYYY-MM-DD``.

        Returns:
            The timestamp in whole seconds, or ``0`` when ``date_text`` is
            empty or does not match the expected format.
        """
        if not date_text:
            return 0
        try:
            beijing_tz = timezone(timedelta(hours=8))
            dt = datetime.strptime(date_text, "%Y-%m-%d").replace(
                hour=8,
                minute=0,
                second=0,
                microsecond=0,
                tzinfo=beijing_tz,
            )
            return int(dt.timestamp())
        except ValueError:
            return 0

    @staticmethod
    def _is_article_link(href: str, text: str) -> bool:
        """Return True when an anchor looks like a link to a news article.

        An anchor qualifies when its href mentions ``Article`` or ``aid=`` and
        its visible text is longer than ten characters.
        """
        return ("Article" in href or "aid=" in href) and len(text) > 10

    def _absolute_url(self, href: str) -> str:
        """Resolve ``href`` against the site root.

        Absolute URLs are returned unchanged. Relative ones are joined with
        ``urljoin`` so that hrefs with or without a leading slash, as well as
        protocol-relative hrefs, all produce a well-formed URL.
        """
        # Normalise to exactly one trailing slash so the last path segment of
        # the base is never replaced or glued to the href.
        site_root = self.base_url.rstrip("/") + "/"
        return urljoin(site_root, href)

    def fetch_news_list(self, category: str = "news", page: int = 1) -> List[Dict]:
        """Fetch the article links found on the site's listing page.

        ``category`` and ``page`` are accepted but currently ignored: the
        request always targets the ``tid=1`` listing.

        Returns:
            A list of dicts with the keys ``title``, ``url``, ``date``,
            ``std_timestamp``, ``source`` and ``crawl_time``. The list is
            empty when the request fails or no link qualifies.
        """
        del category, page
        news_list: List[Dict] = []
        try:
            url = self._absolute_url("index.php?m=home&c=Lists&a=index&tid=1")
            response = requests.get(url, headers=self.headers, timeout=10)
            response.encoding = "utf-8"
            if response.status_code != 200:
                # Report the failure instead of silently returning nothing.
                print(f"获取新闻列表失败: HTTP {response.status_code}")
                return news_list

            soup = BeautifulSoup(response.text, "html.parser")

            # The listing is not parsed for publication dates, so every item
            # carries the crawl date. These values are identical for all
            # items and are therefore computed once, outside the loop.
            # NOTE(review): the date comes from the local clock while
            # _build_std_timestamp reads it as an Asia/Shanghai date.
            now = datetime.now()
            date_text = now.strftime("%Y-%m-%d")
            crawl_time = now.strftime("%Y-%m-%d %H:%M:%S")
            std_timestamp = self._build_std_timestamp(date_text)

            for link in soup.find_all("a", href=True):
                href = link.get("href", "")
                text = link.get_text(strip=True)
                if not self._is_article_link(href, text):
                    continue
                news_list.append(
                    {
                        "title": text,
                        "url": self._absolute_url(href),
                        "date": date_text,
                        "std_timestamp": std_timestamp,
                        "source": "赛文交通网",
                        "crawl_time": crawl_time,
                    }
                )
        except Exception as exc:
            # Best-effort crawl: report the problem and return what we have.
            print(f"获取新闻列表失败: {exc}")
        return news_list

    def fetch_news_content(self, url: str) -> str:
        """Fetch the text of a single article page.

        The body is taken from the first match among ``div.content``,
        ``<article>`` and ``div.article-content``; ``<script>`` and
        ``<style>`` elements inside it are removed before text extraction.

        Args:
            url: Absolute URL of the article.

        Returns:
            The article text with one line per text node, or an empty string
            when the request fails or no content container is found.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.encoding = "utf-8"
            if response.status_code != 200:
                # Report the failure instead of silently returning nothing.
                print(f"获取新闻内容失败 {url}: HTTP {response.status_code}")
                return ""

            soup = BeautifulSoup(response.text, "lxml")
            content_div = (
                soup.find("div", class_="content")
                or soup.find("article")
                or soup.find("div", class_="article-content")
            )
            if content_div:
                for unwanted in content_div.find_all(["script", "style"]):
                    unwanted.decompose()
                return content_div.get_text(strip=True, separator="\n")
        except Exception as exc:
            # Best-effort crawl: one bad article must not abort the run.
            print(f"获取新闻内容失败 {url}: {exc}")
        return ""

    def crawl_and_save(self, output_dir: str = "./data", max_news: int = 50) -> List[Dict]:
        """Crawl the listing, fetch each article and save everything as JSON.

        Args:
            output_dir: Directory for the output file; created if missing.
            max_news: Maximum number of listed articles to fetch.

        Returns:
            The crawled items, each extended with a ``content`` key. The same
            list is written to ``news_<YYYYmmdd_HHMMSS>.json`` in
            ``output_dir``.
        """
        os.makedirs(output_dir, exist_ok=True)
        all_news = []
        news_list = self.fetch_news_list()
        print(f"获取到 {len(news_list)} 条新闻标题")

        selected = news_list[:max_news]
        total = len(selected)
        for position, news in enumerate(selected, start=1):
            print(f"正在爬取第 {position}/{total} 条新闻: {news['title']}")
            news["content"] = self.fetch_news_content(news["url"])
            all_news.append(news)
            # Pause between requests to avoid hammering the site.
            time.sleep(1)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = os.path.join(output_dir, f"news_{timestamp}.json")
        with open(output_file, "w", encoding="utf-8") as file:
            json.dump(all_news, file, ensure_ascii=False, indent=2)
        print(f"新闻已保存到: {output_file}")
        return all_news
if __name__ == "__main__":
    # Script entry point: crawl at most ten articles and report how many
    # items were collected.
    news = TrafficNewsCrawler().crawl_and_save(max_news=10)
    print(f"共爬取 {len(news)} 条新闻")