newsreport_agent_for_traffic/crawler/news_crawler.py

136 lines
4.7 KiB
Python

"""
赛文交通网新闻爬虫模块
"""
import json
import os
import time
from datetime import datetime, timedelta, timezone
from typing import Dict, List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, FeatureNotFound
class TrafficNewsCrawler:
    """Crawler for the traffic news listing served under ``base_url``.

    The crawler downloads one listing page, collects the links that look like
    articles, optionally downloads each article body, and can dump the result
    to a timestamped JSON file.
    """

    @staticmethod
    def _build_std_timestamp(date_text: str) -> int:
        """Build a Unix timestamp for 08:00:00 Asia/Shanghai on the given date.

        Args:
            date_text: Calendar date formatted as ``YYYY-MM-DD``.

        Returns:
            Seconds since the epoch for 08:00:00 at UTC+8 on that date, or
            ``0`` when ``date_text`` is empty or not in the expected format.
        """
        if not date_text:
            return 0
        try:
            day = datetime.strptime(date_text, "%Y-%m-%d")
        except ValueError:
            return 0
        beijing_tz = timezone(timedelta(hours=8))
        anchored = day.replace(
            hour=8,
            minute=0,
            second=0,
            microsecond=0,
            tzinfo=beijing_tz,
        )
        return int(anchored.timestamp())

    @staticmethod
    def _resolve_url(page_url: str, href: str) -> str:
        """Resolve a link found on ``page_url`` into an absolute URL.

        Uses standard URL resolution, so absolute links are returned
        unchanged while root-relative (``/x``), document-relative (``x``) and
        protocol-relative (``//host/x``) links are all resolved correctly.
        """
        return urljoin(page_url, href.strip())

    @staticmethod
    def _parse_html(markup: str) -> "BeautifulSoup":
        """Parse ``markup`` with lxml, falling back to the built-in parser.

        lxml is an optional extra for BeautifulSoup; without the fallback a
        missing lxml install would make every article body come back empty.
        """
        try:
            return BeautifulSoup(markup, "lxml")
        except FeatureNotFound:
            return BeautifulSoup(markup, "html.parser")

    def __init__(self, base_url: str = "https://www.7its.com/"):
        """Store the site root and the HTTP headers sent with every request.

        Args:
            base_url: Root URL of the site to crawl.
        """
        self.base_url = base_url
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/91.0.4472.124 Safari/537.36"
            )
        }

    def fetch_news_list(self, category: str = "news", page: int = 1) -> List[Dict]:
        """Fetch the news listing page and return one dict per article link.

        Args:
            category: Accepted for interface compatibility; currently unused.
            page: Accepted for interface compatibility; currently unused.

        Returns:
            A list of dicts with the keys ``title``, ``url``, ``date``,
            ``std_timestamp``, ``source`` and ``crawl_time``. Each article URL
            appears at most once. The list is empty when the request fails or
            the response status is not 200.
        """
        del category, page
        news_list: List[Dict] = []
        try:
            # Tolerate a base_url given without its trailing slash.
            url = f"{self.base_url.rstrip('/')}/index.php?m=home&c=Lists&a=index&tid=1"
            response = requests.get(url, headers=self.headers, timeout=10)
            response.encoding = "utf-8"
            if response.status_code != 200:
                return news_list

            soup = BeautifulSoup(response.text, "html.parser")

            # Read the clock once so every item of this call carries the same
            # date/crawl_time, even when the call straddles midnight.
            # NOTE(review): "date" is the crawl date in the machine's local
            # time, not a publish date parsed from the page.
            now = datetime.now()
            date_text = now.strftime("%Y-%m-%d")
            crawl_time = now.strftime("%Y-%m-%d %H:%M:%S")
            std_timestamp = self._build_std_timestamp(date_text)

            seen_urls = set()
            for link in soup.find_all("a", href=True):
                href = link.get("href", "")
                title = link.get_text(strip=True)
                looks_like_article = "Article" in href or "aid=" in href
                if not looks_like_article or len(title) <= 10:
                    continue
                article_url = self._resolve_url(url, href)
                # The same article is often linked more than once on a page;
                # keep the first occurrence only.
                if article_url in seen_urls:
                    continue
                seen_urls.add(article_url)
                news_list.append(
                    {
                        "title": title,
                        "url": article_url,
                        "date": date_text,
                        "std_timestamp": std_timestamp,
                        "source": "赛文交通网",
                        "crawl_time": crawl_time,
                    }
                )
        except Exception as exc:
            # Best-effort crawl: report the failure and return what we have.
            print(f"获取新闻列表失败: {exc}")
        return news_list

    def fetch_news_content(self, url: str) -> str:
        """Download an article page and return its main text.

        Args:
            url: Absolute URL of the article page.

        Returns:
            The text of the first matching container (``div.content``,
            ``article`` or ``div.article-content``) with ``script`` and
            ``style`` elements removed and text fragments joined by newlines.
            Returns ``""`` when the request fails, the status is not 200, or
            no container is found.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.encoding = "utf-8"
            if response.status_code != 200:
                return ""
            soup = self._parse_html(response.text)
            content_div = (
                soup.find("div", class_="content")
                or soup.find("article")
                or soup.find("div", class_="article-content")
            )
            if content_div is None:
                return ""
            # Drop non-visible elements before extracting the text.
            for unwanted in content_div.find_all(["script", "style"]):
                unwanted.decompose()
            return content_div.get_text(strip=True, separator="\n")
        except Exception as exc:
            # Best-effort crawl: report the failure and fall through to "".
            print(f"获取新闻内容失败 {url}: {exc}")
        return ""

    def crawl_and_save(self, output_dir: str = "./data", max_news: int = 50) -> List[Dict]:
        """Crawl the listing plus article bodies and save them as JSON.

        Args:
            output_dir: Directory for the output file; created if missing.
            max_news: Maximum number of listed articles to download.

        Returns:
            The crawled news dicts, each extended with a ``content`` key. The
            same data is written to ``news_<YYYYmmdd_HHMMSS>.json`` inside
            ``output_dir``.
        """
        os.makedirs(output_dir, exist_ok=True)
        all_news = []
        news_list = self.fetch_news_list()
        print(f"获取到 {len(news_list)} 条新闻标题")

        total = min(len(news_list), max_news)
        for position, news in enumerate(news_list[:max_news], start=1):
            print(f"正在爬取第 {position}/{total} 条新闻: {news['title']}")
            news["content"] = self.fetch_news_content(news["url"])
            all_news.append(news)
            # Throttle requests to avoid hammering the site.
            time.sleep(1)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = os.path.join(output_dir, f"news_{timestamp}.json")
        with open(output_file, "w", encoding="utf-8") as file:
            json.dump(all_news, file, ensure_ascii=False, indent=2)
        print(f"新闻已保存到: {output_file}")
        return all_news
if __name__ == "__main__":
    # Script entry point: crawl at most ten articles and report the total.
    crawled_items = TrafficNewsCrawler().crawl_and_save(max_news=10)
    print(f"共爬取 {len(crawled_items)} 条新闻")