新闻动态

Python抓取今日头条街拍图片数据

发布日期:2022-02-02 08:19 | 文章来源:脚本之家

(1)抓取今日头条街拍图片

(2)分析今日头条街拍图片结构

keyword: 街拍
pd: atlas
dvpf: pc
aid: 4916
page_num: 1
search_json: {"from_search_id":"20220104115420010212192151532E8188","origin_keyword":"街拍","image_keyword":"街拍"}
rawJSON: 1
search_id: 202201041159040101501341671A4749C4
可以找到规律,page_num从1开始累加,其他参数不变

(3)按功能不同编写不同方法组织代码

获取网页json格式数据

def get_page(page_num):
    """Fetch one page of Toutiao image-search results for the keyword "街拍".

    Args:
        page_num: Value sent as the ``page_num`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network failure/timeout, or a
        body that is not valid JSON).
    """
    # `headers` is deliberately still assigned at module level, exactly as
    # before, so any code that reads the global keeps working.
    global headers
    headers = {
        'Host': 'so.toutiao.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        # NOTE(review): hard-coded cookie captured from a browser session;
        # presumably it expires and must be refreshed -- confirm before use.
        'Cookie': 'msToken=S0DFBkZ9hmyLOGYd3_QjhhXgrm38qTyOITnkNb0t_oavfbVxuYV1JZ0tT5hLgswSfmZLFD6c2lONm_5TomUQXVXjen7CIxM2AGwbhHRYKjhg; _S_DPR=1.5; _S_IPAD=0; MONITOR_WEB_ID=7046351002275317255; ttwid=1%7C0YdWalNdIiSpIk3CvvHwV25U8drq3QAj08E8QOApXhs%7C1640607595%7C720e971d353416921df127996ed708931b4ae28a0a8691a5466347697e581ce8; _S_WIN_WH=262_623',
    }
    params = {
        'keyword': '街拍',
        'pd': 'atlas',
        'dvpf': 'pc',
        'aid': '4916',
        'page_num': page_num,
        # Plain JSON text: urlencode() below percent-encodes it once. The
        # previous pre-encoded literal ended up double-encoded (%7B -> %257B).
        'search_json': '{"from_search_id":"202112272022060101510440283EE83D67","origin_keyword":"街拍","image_keyword":"街拍"}',
        'rawJSON': 1,
        'search_id': '2021122721183101015104402851E3883D',
    }
    url = 'https://so.toutiao.com/search?' + urlencode(params)
    print(url)
    try:
        # The query string is already part of `url`; also passing params=
        # would make requests append every parameter a second time.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Covers connection errors as well as timeouts.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # Body was not valid JSON (e.g. an HTML anti-bot page).
        return None

从json格式数据提取街拍图片

def get_images(json):
    """Yield the ``img_url`` of every entry under ``json['rawData']['data']``.

    Args:
        json: Decoded search response (a dict), or ``None`` when the page
            could not be fetched. The parameter name shadows the stdlib
            module name but is kept so existing callers are unaffected.

    Yields:
        Each non-empty ``img_url`` value, in response order. Nothing is
        yielded when the response, ``rawData`` or ``data`` is missing/empty.
    """
    if not json:
        return
    # assumes rawData is a dict and data is a list of dicts -- TODO confirm
    raw_data = json.get('rawData') or {}
    for image in raw_data.get('data') or []:
        link = image.get('img_url')
        # Entries without a URL cannot be downloaded, so skip them.
        if link:
            yield link

将街拍图片以其md5码命名并保存图片

实现一个保存图片的方法 save_image(),其中 link 就是前面 get_images() 方法逐个产生的图片链接。在该方法中,请求这个图片链接,获取图片的二进制数据,再以二进制的形式写入 ./image/ 目录下的文件。图片的名称使用其内容的 MD5 值,这样可以去除重复。相关代码如下:

def save_image(link):
    """Download one image and store it as ``./image/<md5 of content>.jpg``.

    Naming the file after the MD5 of its bytes means identical images
    overwrite each other instead of piling up as duplicates.

    Args:
        link: URL of the image to download.

    Returns:
        None. Failed downloads (network error, timeout, non-200 status) are
        reported on stdout and skipped; nothing is written for them.
    """
    import os  # local import: the file's import block does not provide os

    try:
        response = requests.get(link, timeout=10)
    except requests.RequestException as exc:
        print(f'skip {link}: {exc}')
        return
    if response.status_code != 200:
        # Do not save an error page under a .jpg name.
        print(f'skip {link}: HTTP {response.status_code}')
        return
    data = response.content
    # open() does not create missing directories.
    os.makedirs('./image', exist_ok=True)
    with open(f'./image/{md5(data).hexdigest()}.jpg', 'wb') as f:
        f.write(data)

main()调用其他函数

def main(page_num):
    """Download every image listed on one search-result page.

    Args:
        page_num: Page number forwarded to ``get_page``.
    """
    page_data = get_page(page_num)
    if page_data is None:
        # get_page returns None when the page could not be fetched.
        return
    for link in get_images(page_data):
        if link:
            save_image(link)

(4)抓取20页今日头条街拍图片数据

这里定义了分页的起始页数和终止页数,分别为 GROUP_START 和 GROUP_END,还利用了多进程的进程池(multiprocessing.pool.Pool),调用其 map() 方法实现多进程下载。

if __name__ == '__main__':
    # Inclusive range of result pages to download.
    GROUP_START = 1
    GROUP_END = 20
    groups = list(range(GROUP_START, GROUP_END + 1))
    # One main(page_num) call per page, spread over a pool of worker
    # processes; map() blocks until every page has been processed.
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
import requests
from urllib.parse import urlencode
from hashlib import md5
from multiprocessing.pool import Pool
def get_page(page_num):
    """Fetch one page of Toutiao image-search results for the keyword "街拍".

    Args:
        page_num: Value sent as the ``page_num`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network failure/timeout, or a
        body that is not valid JSON).
    """
    # `headers` is deliberately still assigned at module level, exactly as
    # before, so any code that reads the global keeps working.
    global headers
    headers = {
        'Host': 'so.toutiao.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        # NOTE(review): hard-coded cookie captured from a browser session;
        # presumably it expires and must be refreshed -- confirm before use.
        'Cookie': 'msToken=S0DFBkZ9hmyLOGYd3_QjhhXgrm38qTyOITnkNb0t_oavfbVxuYV1JZ0tT5hLgswSfmZLFD6c2lONm_5TomUQXVXjen7CIxM2AGwbhHRYKjhg; _S_DPR=1.5; _S_IPAD=0; MONITOR_WEB_ID=7046351002275317255; ttwid=1%7C0YdWalNdIiSpIk3CvvHwV25U8drq3QAj08E8QOApXhs%7C1640607595%7C720e971d353416921df127996ed708931b4ae28a0a8691a5466347697e581ce8; _S_WIN_WH=262_623',
    }
    params = {
        'keyword': '街拍',
        'pd': 'atlas',
        'dvpf': 'pc',
        'aid': '4916',
        'page_num': page_num,
        # Plain JSON text: urlencode() below percent-encodes it once. The
        # previous pre-encoded literal ended up double-encoded (%7B -> %257B).
        'search_json': '{"from_search_id":"202112272022060101510440283EE83D67","origin_keyword":"街拍","image_keyword":"街拍"}',
        'rawJSON': 1,
        'search_id': '2021122721183101015104402851E3883D',
    }
    url = 'https://so.toutiao.com/search?' + urlencode(params)
    print(url)
    try:
        # The query string is already part of `url`; also passing params=
        # would make requests append every parameter a second time.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Covers connection errors as well as timeouts.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # Body was not valid JSON (e.g. an HTML anti-bot page).
        return None
def get_images(json):
    """Yield the ``img_url`` of every entry under ``json['rawData']['data']``.

    Args:
        json: Decoded search response (a dict), or ``None`` when the page
            could not be fetched. The parameter name shadows the stdlib
            module name but is kept so existing callers are unaffected.

    Yields:
        Each non-empty ``img_url`` value, in response order. Nothing is
        yielded when the response, ``rawData`` or ``data`` is missing/empty.
    """
    if not json:
        return
    # assumes rawData is a dict and data is a list of dicts -- TODO confirm
    raw_data = json.get('rawData') or {}
    for image in raw_data.get('data') or []:
        link = image.get('img_url')
        # Entries without a URL cannot be downloaded, so skip them.
        if link:
            yield link

def save_image(link):
    """Download one image and store it as ``./image/<md5 of content>.jpg``.

    Naming the file after the MD5 of its bytes means identical images
    overwrite each other instead of piling up as duplicates.

    Args:
        link: URL of the image to download.

    Returns:
        None. Failed downloads (network error, timeout, non-200 status) are
        reported on stdout and skipped; nothing is written for them.
    """
    import os  # local import: the file's import block does not provide os

    try:
        response = requests.get(link, timeout=10)
    except requests.RequestException as exc:
        print(f'skip {link}: {exc}')
        return
    if response.status_code != 200:
        # Do not save an error page under a .jpg name.
        print(f'skip {link}: HTTP {response.status_code}')
        return
    data = response.content
    # open() does not create missing directories.
    os.makedirs('./image', exist_ok=True)
    with open(f'./image/{md5(data).hexdigest()}.jpg', 'wb') as f:
        f.write(data)


def main(page_num):
    """Download every image listed on one search-result page.

    Args:
        page_num: Page number forwarded to ``get_page``.
    """
    page_data = get_page(page_num)
    if page_data is None:
        # get_page returns None when the page could not be fetched.
        return
    for link in get_images(page_data):
        if link:
            save_image(link)


if __name__ == '__main__':
    # Inclusive range of result pages to download.
    GROUP_START = 1
    GROUP_END = 20
    groups = list(range(GROUP_START, GROUP_END + 1))
    # One main(page_num) call per page, spread over a pool of worker
    # processes; map() blocks until every page has been processed.
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()

到此这篇关于Python抓取今日头条街拍图片数据的文章就介绍到这了,更多相关Python抓取今日头条图片内容请搜索本站以前的文章或继续浏览下面的相关文章,希望大家以后多多支持本站!

美国服务器租用

版权声明:本站文章来源标注为YINGSOO的内容版权均为本站所有,欢迎引用、转载,请保持原文完整并注明来源及原文链接。禁止复制或仿造本网站,禁止在非www.yingsoo.com所属的服务器上建立镜像,否则将依法追究法律责任。本站部分内容来源于网友推荐、互联网收集整理而来,仅供学习参考,不代表本站立场,如有内容涉嫌侵权,请联系alex-e#qq.com处理。

相关文章

实时开通

自选配置、实时开通

免备案

全球线路精选!

全天候客户服务

7x24全年不间断在线

专属顾问服务

1对1客户咨询顾问

在线
客服

在线客服:7*24小时在线

客服
热线

400-630-3752
7*24小时客服服务热线

关注
微信

关注官方微信
顶部