通过授权账号(AppID 和 AppSecret)爬取微信公众号【某个具体的公众号】中的所有文章内容,并存放到本地 D 盘。
使用 https://api.weixin.qq.com/cgi-bin/material/batchget_material 接口获取图文素材,
但在获取过程中发现获取的文章数量不全:总共有 84 篇文章,多次调试始终只能获取到早期的 32 篇,请问问题出在哪里?
具体代码详情如下:
import requests
import os
import re
import time
# Credentials sent to the WeChat token endpoint by get_access_token().
# NOTE(review): these look like placeholders to be replaced with real values;
# the APP_ID literal contains a trailing space, so make sure the real AppID is
# pasted without surrounding whitespace.
APP_ID = 'AppID '
APP_SECRET = 'AppSecret'
# Directory that receives one .html file per saved article.
SAVE_DIR = 'D:/wechat_articles'
def get_access_token():
    """Fetch an access token for the WeChat Official Account API.

    Calls the ``cgi-bin/token`` endpoint with the ``client_credential`` grant,
    using the module-level ``APP_ID`` and ``APP_SECRET``.

    Returns:
        The access token string, or ``None`` when the request fails, the
        response is not valid JSON, or the response has no ``access_token``
        field. In every failure case the reason is printed.
    """
    url = 'https://api.weixin.qq.com/cgi-bin/token'
    # Let requests build the query string so the credentials are URL-encoded
    # instead of being spliced into the URL verbatim.
    params = {
        'grant_type': 'client_credential',
        'appid': APP_ID,
        'secret': APP_SECRET,
    }
    try:
        # A timeout keeps the script from hanging forever on a dead connection.
        response = requests.get(url, params=params, timeout=10)
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # error is not a RequestException subclass.
        print(f"获取 Access Token 失败: {e}")
        return None

    if 'access_token' in result:
        # The token is a credential, so it is returned but not echoed to stdout.
        return result['access_token']

    print(f"获取 Access Token 失败: {result}")
    return None
# Characters that are not allowed in Windows file names, plus ASCII control
# characters. Compiled once because the function is called for every article.
_INVALID_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1F]')

# Device names Windows reserves regardless of the file extension.
_WINDOWS_RESERVED_NAMES = frozenset(
    ['CON', 'PRN', 'AUX', 'NUL']
    + [f'COM{n}' for n in range(1, 10)]
    + [f'LPT{n}' for n in range(1, 10)]
)


def sanitize_filename(filename):
    """Turn an arbitrary title into a string that is safe to use as a file name.

    Each illegal character (``< > : " / \\ | ? *`` and ASCII control
    characters) is replaced with ``_`` and surrounding whitespace is removed.

    Args:
        filename: The raw name, e.g. an article title.

    Returns:
        The cleaned name. ``'untitled'`` is returned when nothing usable is
        left, and a leading ``_`` is added when the part before the first dot
        is a reserved Windows device name such as ``CON`` or ``NUL``.
    """
    sanitized = _INVALID_FILENAME_CHARS.sub('_', filename).strip()
    if not sanitized:
        # Avoid producing a bare extension such as ".html" for empty titles.
        return 'untitled'

    # Windows matches reserved names case-insensitively against the base name.
    stem = sanitized.split('.', 1)[0].rstrip().upper()
    if stem in _WINDOWS_RESERVED_NAMES:
        return f'_{sanitized}'
    return sanitized
def save_article(title, content):
    """Write one article into ``SAVE_DIR`` as ``<sanitized title>.html``.

    The file name is derived from ``title`` via ``sanitize_filename`` and the
    content is written as UTF-8 text. An existing file with the same name is
    overwritten. Any exception raised while writing is caught and reported on
    stdout rather than propagated.

    Args:
        title: Article title; also used in the progress messages.
        content: Text written to the file.
    """
    destination = os.path.join(SAVE_DIR, f'{sanitize_filename(title)}.html')
    try:
        with open(destination, 'w', encoding='utf-8') as html_file:
            html_file.write(content)
        print(f"文章 {title} 已保存到 {destination}")
    except Exception as error:
        print(f"保存文章 {title} 失败: {error}")
# Listing endpoint and the source-specific JSON fields for each article source.
_ARTICLE_SOURCES = {
    # Permanent "news" material: the endpoint the script originally used.
    'material': (
        'https://api.weixin.qq.com/cgi-bin/material/batchget_material',
        {'type': 'news'},
    ),
    # Articles that went through the publish flow; no_content=0 asks the API
    # to include the article body in the listing.
    'published': (
        'https://api.weixin.qq.com/cgi-bin/freepublish/batchget',
        {'no_content': 0},
    ),
}


def get_articles(access_token, source='material'):
    """Page through an article listing API and save every article locally.

    Creates ``SAVE_DIR`` if needed, then requests pages of up to 20 entries
    until an empty page or an error response is returned. Every article in
    each entry's ``content.news_item`` list is passed to ``save_article``.
    Progress and the final article count are printed.

    Args:
        access_token: Access token appended to each request as a query
            parameter.
        source: ``'material'`` (default, the original behaviour) lists
            permanent news material; ``'published'`` lists published articles
            via ``freepublish/batchget``.

    Raises:
        ValueError: If ``source`` is not one of the supported values.
    """
    try:
        url, extra_fields = _ARTICLE_SOURCES[source]
    except KeyError:
        raise ValueError(f'unknown article source: {source!r}') from None

    os.makedirs(SAVE_DIR, exist_ok=True)

    offset = 0
    count = 20
    # Running total across ALL pages; it must live outside the loop, otherwise
    # it is reset on every request and the final report is wrong.
    total_articles = 0

    while True:
        data = {"offset": offset, "count": count, **extra_fields}
        try:
            response = requests.post(
                url,
                params={'access_token': access_token},
                json=data,
                timeout=30,
            )
            # Force UTF-8 so Chinese text is decoded correctly even if the
            # response carries no charset.
            response.encoding = 'utf-8'
            result = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"获取文章列表失败: {e}")
            break

        if 'total_count' in result:
            print(result['total_count'])
        else:
            print(f"响应中未包含 total_count 字段: {result}")

        if 'item' not in result:
            # Error responses carry errcode/errmsg instead of an item list.
            print(f"获取文章列表失败: {result}")
            break

        items = result['item']
        if not items:
            break

        for item in items:
            article_info = item['content']['news_item']
            total_articles += len(article_info)
            for article in article_info:
                save_article(article['title'], article['content'])

        # Advance by what was actually returned so a short page cannot make
        # the next request skip entries.
        offset += len(items)
        time.sleep(1)

    print(f"实际保存的文章总数: {total_articles}")
if __name__ == "__main__":
    # Script entry point: obtain a token first and only start downloading
    # when one was actually returned.
    token = get_access_token()
    if token:
        get_articles(token)
从你的代码中看到,你调用的是 /cgi-bin/material/batchget_material 接口。
/cgi-bin/material/batchget_material 为获取永久素材的列表接口, type: "news" 时获取的是【内容管理——草稿箱——历史图文素材】里面的内容(即草稿箱功能上线之前所编辑的图文素材)。