""" 赛文交通网新闻爬虫模块 """ import json import os import time from datetime import datetime, timedelta, timezone from typing import Dict, List import requests from bs4 import BeautifulSoup class TrafficNewsCrawler: """交通新闻爬虫类。""" @staticmethod def _build_std_timestamp(date_text: str) -> int: """Build a Unix timestamp for 08:00:00 Asia/Shanghai on the given date.""" if not date_text: return 0 try: beijing_tz = timezone(timedelta(hours=8)) dt = datetime.strptime(date_text, "%Y-%m-%d").replace( hour=8, minute=0, second=0, microsecond=0, tzinfo=beijing_tz, ) return int(dt.timestamp()) except ValueError: return 0 def __init__(self, base_url: str = "https://www.7its.com/"): self.base_url = base_url self.headers = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/91.0.4472.124 Safari/537.36" ) } def fetch_news_list(self, category: str = "news", page: int = 1) -> List[Dict]: """获取新闻列表。""" del category, page news_list = [] try: url = f"{self.base_url}index.php?m=home&c=Lists&a=index&tid=1" response = requests.get(url, headers=self.headers, timeout=10) response.encoding = "utf-8" if response.status_code == 200: soup = BeautifulSoup(response.text, "html.parser") all_links = soup.find_all("a", href=True) for link in all_links: href = link.get("href", "") text = link.get_text(strip=True) if ("Article" in href or "aid=" in href) and len(text) > 10: date_text = datetime.now().strftime("%Y-%m-%d") news_item = { "title": text, "url": href, "date": date_text, "std_timestamp": self._build_std_timestamp(date_text), "source": "赛文交通网", "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), } if not news_item["url"].startswith("http"): news_item["url"] = self.base_url.rstrip("/") + news_item["url"] news_list.append(news_item) except Exception as exc: print(f"获取新闻列表失败: {exc}") return news_list def fetch_news_content(self, url: str) -> str: """获取新闻详情内容。""" try: response = requests.get(url, headers=self.headers, timeout=10) response.encoding = "utf-8" if response.status_code == 200: soup = BeautifulSoup(response.text, "lxml") content_div = ( soup.find("div", class_="content") or soup.find("article") or soup.find("div", class_="article-content") ) if content_div: for script in content_div(["script", "style"]): script.decompose() return content_div.get_text(strip=True, separator="\n") except Exception as exc: print(f"获取新闻内容失败 {url}: {exc}") return "" def crawl_and_save(self, output_dir: str = "./data", max_news: int = 50) -> List[Dict]: """爬取新闻并保存。""" os.makedirs(output_dir, exist_ok=True) all_news = [] news_list = self.fetch_news_list() print(f"获取到 {len(news_list)} 条新闻标题") for i, news in enumerate(news_list[:max_news]): print(f"正在爬取第 {i + 1}/{min(len(news_list), max_news)} 条新闻: {news['title']}") content = self.fetch_news_content(news["url"]) news["content"] = content all_news.append(news) time.sleep(1) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_file = os.path.join(output_dir, f"news_{timestamp}.json") with open(output_file, "w", encoding="utf-8") as file: json.dump(all_news, file, ensure_ascii=False, indent=2) print(f"新闻已保存到: {output_file}") return all_news if __name__ == "__main__": crawler = TrafficNewsCrawler() news = crawler.crawl_and_save(max_news=10) print(f"共爬取 {len(news)} 条新闻")