- 通过授权账号爬取微信公众号文章,发现获取的文章数量不全,请问问题出在哪里?
# Forum question (translated from the original post):
#   Using an authorised account (AppID and AppSecret), I crawl every article of
#   one specific WeChat Official Account and store the content on the local D:
#   drive. The post cites the interface
#   https://api.weixin.qq.com/cgi-bin/material/get_material?access_token=ACCESS_TOKEN
#   (the script below actually calls material/batchget_material). The account
#   has 84 articles in total, but after repeated debugging only the 32 earliest
#   ones are ever returned. Where is the problem? The code is as follows.

import os
import re
import time

# Replace with your Official Account's AppID and AppSecret.
APP_ID = 'AppID '
APP_SECRET = 'AppSecret'

# Local directory in which the articles are stored.
SAVE_DIR = 'D:/wechat_articles'

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# Number of items requested per page (the script has always used 20).
PAGE_SIZE = 20

# Characters that may not appear in a Windows file name.
_INVALID_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1F]')

# Listing endpoints understood by get_articles(): URL plus the request-body
# fields that are specific to that endpoint.
_BATCHGET_ENDPOINTS = {
    'material': (
        'https://api.weixin.qq.com/cgi-bin/material/batchget_material',
        {'type': 'news'},
    ),
    'freepublish': (
        'https://api.weixin.qq.com/cgi-bin/freepublish/batchget',
        {'no_content': 0},
    ),
}


def get_access_token():
    """Fetch an access token for the configured Official Account.

    Returns:
        The access token string, or None when the request fails or the
        response carries no ``access_token`` field (the failure is printed).
    """
    # Imported here so the pure helpers in this script stay usable without
    # the HTTP dependency installed.
    import requests

    # Passing the credentials as params URL-encodes them; strip() guards
    # against stray whitespace around the pasted values.
    params = {
        'grant_type': 'client_credential',
        'appid': APP_ID.strip(),
        'secret': APP_SECRET.strip(),
    }
    try:
        response = requests.get(
            'https://api.weixin.qq.com/cgi-bin/token',
            params=params,
            timeout=REQUEST_TIMEOUT,
        )
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"获取 Access Token 失败: {e}")
        return None

    token = result.get('access_token')
    if token:
        # The token is a credential, so it is deliberately not echoed.
        return token
    print(f"获取 Access Token 失败: {result}")
    return None


def sanitize_filename(filename):
    """Return *filename* with characters illegal in file names replaced by '_'.

    Surrounding whitespace is removed, and an empty result falls back to
    ``'untitled'`` so that a file is never named just ``.html``.
    """
    sanitized = _INVALID_FILENAME_CHARS.sub('_', filename).strip()
    return sanitized or 'untitled'


def save_article(title, content, save_dir=None):
    """Write one article to ``<save_dir>/<sanitized title>.html``.

    Args:
        title: Article title; used (sanitized) as the file name.
        content: Article body to write, encoded as UTF-8.
        save_dir: Target directory; defaults to ``SAVE_DIR``. It is created
            when missing.

    Returns:
        The path of the written file, or None when writing failed (the
        failure is printed).
    """
    target_dir = SAVE_DIR if save_dir is None else save_dir
    file_path = os.path.join(target_dir, f'{sanitize_filename(title)}.html')
    try:
        os.makedirs(target_dir, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
    except OSError as e:
        print(f"保存文章 {title} 失败: {e}")
        return None
    print(f"文章 {title} 已保存到 {file_path}")
    return file_path


def _extract_articles(result):
    """Return the (title, content) pairs found in one listing response."""
    pairs = []
    for item in result.get('item') or []:
        news_items = (item.get('content') or {}).get('news_item') or []
        for article in news_items:
            pairs.append(
                (article.get('title') or '', article.get('content') or '')
            )
    return pairs


def _unique_title(title, used):
    """Return *title*, suffixed with ' (n)' if its file name is already taken.

    *used* is the set of case-folded file names handed out so far; it is
    updated in place.
    """
    candidate = title
    suffix = 1
    while sanitize_filename(candidate).casefold() in used:
        suffix += 1
        candidate = f'{title} ({suffix})'
    used.add(sanitize_filename(candidate).casefold())
    return candidate


def get_articles(access_token, save_dir=None, source='material'):
    """Download every listed article and save each one as an HTML file.

    Args:
        access_token: Token obtained from get_access_token().
        save_dir: Target directory; defaults to ``SAVE_DIR``.
        source: ``'material'`` (default, the endpoint this script has always
            used) or ``'freepublish'`` to list via ``freepublish/batchget``.

    Returns:
        The number of articles that were actually written to disk.

    Raises:
        ValueError: If *source* is not a known endpoint name.

    NOTE(review): material/batchget_material presumably lists only the
    permanent-material library, so articles published through the newer
    draft/publish flow may be missing from it -- a likely cause of "84
    published, 32 returned". Try ``source='freepublish'`` and confirm against
    the official API documentation.
    """
    import requests

    try:
        url, extra_fields = _BATCHGET_ENDPOINTS[source]
    except KeyError:
        raise ValueError(f"unknown source: {source!r}") from None

    target_dir = SAVE_DIR if save_dir is None else save_dir
    os.makedirs(target_dir, exist_ok=True)

    offset = 0
    total_articles = 0  # accumulated across ALL pages, not reset per page
    used_names = set()  # prevents same-titled articles overwriting each other

    while True:
        payload = dict(extra_fields, offset=offset, count=PAGE_SIZE)
        try:
            response = requests.post(
                url,
                params={'access_token': access_token},
                json=payload,
                timeout=REQUEST_TIMEOUT,
            )
            # Make sure the body is decoded as UTF-8.
            response.encoding = 'utf-8'
            result = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"获取文章列表失败: {e}")
            break

        if result.get('errcode') or 'item' not in result:
            print(f"获取文章列表失败: {result}")
            break

        if 'total_count' in result:
            print(result['total_count'])
        else:
            print(f"响应中未包含 total_count 字段: {result}")

        items = result['item']
        if not items:
            break

        for title, content in _extract_articles(result):
            unique_title = _unique_title(title, used_names)
            if save_article(unique_title, content, target_dir) is not None:
                total_articles += 1

        # Advance by what was actually returned; a short page must not make
        # the next request skip items.
        offset += len(items)
        total_count = result.get('total_count')
        if isinstance(total_count, int) and offset >= total_count:
            break

        # Pause between pages to stay clear of the API rate limit.
        time.sleep(1)

    print(f"实际保存的文章总数: {total_articles}")
    return total_articles


if __name__ == "__main__":
    access_token = get_access_token()
    if access_token:
        get_articles(access_token)
05-02 - 通过freepublish/batchget接口获取公众号文章不全,如何解决?
接口地址:https://api.weixin.qq.com/cgi-bin/freepublish/batchget 。通过 freepublish/batchget 接口获取公众号文章时结果不全:“已通知”状态的文章没有获取到(已做分页处理),接口返回的文章总数中也不包括“已通知”状态的文章,如何解决?
05-02