136 lines
4.7 KiB
Python
136 lines
4.7 KiB
Python
"""
|
|
News crawler module for the 7its.com traffic news site (赛文交通网).
|
|
"""
|
|
import json
|
|
import os
|
|
import time
|
|
from datetime import datetime, timedelta, timezone
|
|
from typing import Dict, List
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
class TrafficNewsCrawler:
    """Crawler for news published on 7its.com (赛文交通网).

    The crawler downloads one fixed list page, collects the links that look
    like article links, fetches the text of every article and stores the
    result as a JSON file.

    Failures while fetching or parsing are reported with ``print`` and turned
    into empty results, so a single bad page never aborts a crawl.
    """

    @staticmethod
    def _build_std_timestamp(date_text: str) -> int:
        """Return the Unix timestamp of 08:00:00 (UTC+8) on the given date.

        Args:
            date_text: Calendar date formatted as ``YYYY-MM-DD``.

        Returns:
            Seconds since the epoch, or ``0`` when ``date_text`` is empty or
            does not match the expected format.
        """
        if not date_text:
            return 0
        try:
            day = datetime.strptime(date_text, "%Y-%m-%d")
        except ValueError:
            return 0
        beijing_tz = timezone(timedelta(hours=8))
        # strptime() yields midnight, so only the hour and the zone need setting.
        return int(day.replace(hour=8, tzinfo=beijing_tz).timestamp())

    @staticmethod
    def _absolute_url(page_url: str, href: str) -> str:
        """Resolve an ``href`` found on ``page_url`` to an absolute URL.

        Plain string concatenation breaks for relative links without a leading
        slash and for protocol-relative links (``//host/path``); ``urljoin``
        applies the standard resolution rules instead.
        """
        # Imported locally so the class does not depend on a file-level import.
        from urllib.parse import urljoin

        return urljoin(page_url, href.strip())

    @staticmethod
    def _parse_article(html: str):
        """Parse article HTML with lxml, falling back to the built-in parser."""
        from bs4 import FeatureNotFound

        try:
            return BeautifulSoup(html, "lxml")
        except FeatureNotFound:
            # lxml is an optional extra; without this fallback every article
            # body would silently come back empty when it is not installed.
            return BeautifulSoup(html, "html.parser")

    def __init__(self, base_url: str = "https://www.7its.com/", timeout: float = 10):
        """Configure the crawler.

        Args:
            base_url: Site root. The list URL is built by appending to it, so
                it is expected to end with ``/``.
            timeout: Timeout in seconds applied to every HTTP request.
        """
        self.base_url = base_url
        self.timeout = timeout
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/91.0.4472.124 Safari/537.36"
            )
        }

    def _get_html(self, url: str) -> str:
        """Download ``url`` and return its body decoded as UTF-8.

        Raises:
            requests.RequestException: On a network failure or an HTTP error
                status (4xx/5xx).
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Turn HTTP error answers into exceptions so callers can report them
        # instead of silently producing an empty result.
        response.raise_for_status()
        # Decode as UTF-8 regardless of the charset advertised in the headers.
        response.encoding = "utf-8"
        return response.text

    def fetch_news_list(self, category: str = "news", page: int = 1) -> List[Dict]:
        """Collect article links from the site's news list page.

        Args:
            category: Accepted for interface compatibility; currently unused
                because the list URL is fixed (``tid=1``).
            page: Accepted for interface compatibility; currently unused.

        Returns:
            One dict per unique article link with the keys ``title``, ``url``
            (absolute), ``date``, ``std_timestamp``, ``source`` and
            ``crawl_time``. ``date`` is the local crawl date; no publication
            date is parsed from the page. The list is empty when the page
            cannot be fetched or parsed.
        """
        del category, page  # The list URL below does not depend on them yet.

        list_url = f"{self.base_url}index.php?m=home&c=Lists&a=index&tid=1"
        news_list: List[Dict] = []

        try:
            soup = BeautifulSoup(self._get_html(list_url), "html.parser")

            # Read the clock once so "date" and "crawl_time" always agree.
            now = datetime.now()
            date_text = now.strftime("%Y-%m-%d")
            crawl_time = now.strftime("%Y-%m-%d %H:%M:%S")
            std_timestamp = self._build_std_timestamp(date_text)

            seen_urls = set()
            for link in soup.find_all("a", href=True):
                href = link.get("href", "")
                title = link.get_text(strip=True)

                # Keep links whose href mentions "Article" or "aid=" and whose
                # text is longer than ten characters (presumably this filters
                # out navigation and icon links).
                looks_like_article = "Article" in href or "aid=" in href
                if not (looks_like_article and len(title) > 10):
                    continue

                article_url = self._absolute_url(list_url, href)
                if article_url in seen_urls:
                    # The same article linked twice must not be crawled twice.
                    continue
                seen_urls.add(article_url)

                news_list.append(
                    {
                        "title": title,
                        "url": article_url,
                        "date": date_text,
                        "std_timestamp": std_timestamp,
                        "source": "赛文交通网",
                        "crawl_time": crawl_time,
                    }
                )
        except Exception as exc:  # Best-effort boundary: report, keep going.
            print(f"获取新闻列表失败: {exc}")

        return news_list

    def fetch_news_content(self, url: str) -> str:
        """Download an article page and return its main text.

        The text is taken from the first match of ``<div class="content">``,
        ``<article>`` or ``<div class="article-content">``, with ``<script>``
        and ``<style>`` elements removed.

        Args:
            url: Absolute URL of the article page.

        Returns:
            The article text with one text fragment per line, or ``""`` when
            the page cannot be fetched or no content container is found.
        """
        try:
            soup = self._parse_article(self._get_html(url))
            content_div = (
                soup.find("div", class_="content")
                or soup.find("article")
                or soup.find("div", class_="article-content")
            )
            if content_div is not None:
                for tag in content_div(["script", "style"]):
                    tag.decompose()
                return content_div.get_text(strip=True, separator="\n")
        except Exception as exc:  # Best-effort boundary: report, keep going.
            print(f"获取新闻内容失败 {url}: {exc}")

        return ""

    def crawl_and_save(
        self, output_dir: str = "./data", max_news: int = 50, delay: float = 1.0
    ) -> List[Dict]:
        """Crawl the news list, fetch every article and save the result as JSON.

        Args:
            output_dir: Directory for the output file; created when missing.
            max_news: Maximum number of articles to fetch. Values below zero
                are treated as zero.
            delay: Pause in seconds between two article requests.

        Returns:
            The crawled items, each extended with a ``content`` key. The same
            data is written to ``news_<YYYYmmdd_HHMMSS>.json`` in ``output_dir``.
        """
        os.makedirs(output_dir, exist_ok=True)

        news_list = self.fetch_news_list()
        print(f"获取到 {len(news_list)} 条新闻标题")

        # A negative limit would otherwise slice from the end of the list.
        selected = news_list[: max(max_news, 0)]
        total = len(selected)

        all_news = []
        for index, news in enumerate(selected, start=1):
            print(f"正在爬取第 {index}/{total} 条新闻: {news['title']}")
            news["content"] = self.fetch_news_content(news["url"])
            all_news.append(news)
            # Throttle requests, but do not wait after the final one.
            if delay > 0 and index < total:
                time.sleep(delay)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = os.path.join(output_dir, f"news_{timestamp}.json")

        with open(output_file, "w", encoding="utf-8") as file:
            json.dump(all_news, file, ensure_ascii=False, indent=2)

        print(f"新闻已保存到: {output_file}")
        return all_news
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: crawl at most ten articles with the default
    # crawler settings and report how many items were collected.
    news = TrafficNewsCrawler().crawl_and_save(max_news=10)
    print(f"共爬取 {len(news)} 条新闻")