@远方很远
from multiprocess import Process
import time

def func():
    for i in range(10):
        print('子', i)        # child process counts up
        time.sleep(1)

if __name__ == '__main__':
    p = Process(target=func)
    p.start()
    for i in range(10):
        print('主', i)        # main process counts up
        time.sleep(1)
Why is it so hard to get this to start?
Paste yours over so I can take a look.
I think there's something wrong with my module.
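If the third-party multiprocess package really is the culprit, one quick thing to try is the standard-library multiprocessing module, which exposes the same Process API. A minimal sketch, and only a guess at the cause; note that on Windows the Process must be created under the if __name__ == '__main__': guard or start() will fail:

from multiprocessing import Process   # standard library, no pip install needed
import time

def func():
    for i in range(10):
        print('child', i)
        time.sleep(1)

if __name__ == '__main__':            # required on Windows (spawn start method)
    p = Process(target=func)
    p.start()
    for i in range(10):
        print('parent', i)
        time.sleep(1)
    p.join()                          # wait for the child to finish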
import requests
import re
import json
import os
from requests.exceptions import RequestException

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Referer': 'https://maoyan.com',
    'Cookie': '__mta=150257545.1631492828404.1631494019461.1631494128206.11; uuid_n_v=v1; uuid=527644E0142911ECB55CBB305D2AA0B10206AFBAB4CB475088C931781076E1FA; _csrf=5e4a62b4751f4510de197b6bc0c95135582879125c178b5ff5f2ebd94019dba5; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1631492828; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=17bdc8cea2dc8-00c198070d82d5-3e604809-15f900-17bdc8cea2ec8; _lxsdk=527644E0142911ECB55CBB305D2AA0B10206AFBAB4CB475088C931781076E1FA; __mta=150257545.1631492828404.1631492828404.1631492831171.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1631494128; _lxsdk_s=17bdc8cea2f-7eb-b35-003%7C%7C26'
}
# Fetch the HTML source of a single page
def get_one_page(url, headers):
    try:
        response = requests.get(url, headers=headers)   # request the page
        if response.status_code == 200:                 # 200 means the request succeeded
            return response.text
        return None
    except RequestException:
        return None
# Parse the page: extract the fields with a regular expression
def parse_one_page(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the leading "主演:" label
            'time': item[4].strip()[5:],    # drop the leading "上映时间:" label
            'score': item[5] + item[6]      # integer part + fractional part
        }
# Write the extracted info to a file
def write_to_file(content):
    # encoding='utf-8' plus ensure_ascii=False keeps the Chinese text readable in the file
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
# Download the movie poster
def save_image_file(url, path):
    jd = requests.get(url)
    if jd.status_code == 200:
        with open(path, 'wb') as f:
            f.write(jd.content)
# Main function
def main(offset):
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url, headers)
    if not os.path.exists('movieImgs'):
        os.mkdir('movieImgs')
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
        save_image_file(item['image'], 'movieImgs/' + item['title'] + '.jpg')

if __name__ == '__main__':
    # crawl each page of the board, 10 movies per page
    for i in range(10):
        main(i * 10)
So where should I add it? I don't know where to pick the crawl back up from.
I'd suggest swapping out all the regex and using lxml instead.
This is what I usually use:
from lxml import etree
from bs4 import BeautifulSoup

r1 = requests.get(url=urls, headers=headers(), verify=False)   # headers() here is my own helper that returns a headers dict
r1 = r1.content.decode('gbk', errors='ignore')
obj1 = etree.HTML(r1)
soup = BeautifulSoup(r1, 'lxml')
标题 = obj1.xpath('//*[@id="content"]/div[1]/h1/text()')   # the title comes straight out
nexturls = obj1.xpath('//*[@id="content"]/div[1]/div/a[3]/@href')
nexturls = starturl + nexturls[0]
r2 = requests.get(url=nexturls, headers=headers(), verify=False)
r2 = r2.content.decode('gbk', errors='ignore')
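Applying that same lxml/XPath approach to the maoyan parser might look roughly like the sketch below. It is only a sketch: the selectors are guessed from the class names the original regex targets (board-index, name, star, releasetime, integer, fraction) and have not been verified against the live page.

from lxml import etree

def first(nodes):
    # xpath() returns a list; take the first hit or fall back to an empty string
    return nodes[0].strip() if nodes else ''

def parse_one_page_lxml(html):
    doc = etree.HTML(html)
    for dd in doc.xpath('//dd'):   # one <dd> block per movie on the board page
        yield {
            'index': first(dd.xpath('.//i[contains(@class, "board-index")]/text()')),
            'image': first(dd.xpath('.//img/@data-src')),
            'title': first(dd.xpath('.//p[@class="name"]/a/text()')),
            'actor': first(dd.xpath('.//p[@class="star"]/text()'))[3:],
            'time': first(dd.xpath('.//p[@class="releasetime"]/text()'))[5:],
            'score': first(dd.xpath('.//i[@class="integer"]/text()'))
                     + first(dd.xpath('.//i[@class="fraction"]/text()')),
        }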
That r2 request kicks off the second level of the crawl. Also, crawling one page at a time is far too inefficient; I'd suggest building a header pool
plus using multithreading. It took me over an hour to crawl a single novel.
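A header pool plus a thread pool wrapped around the main(offset) calls from the script above could look something like the sketch below. It assumes the main() defined earlier, the second User-Agent string is just a placeholder, and get_one_page() would be called with pick_headers() instead of the fixed headers dict.

import random
from concurrent.futures import ThreadPoolExecutor

# a small pool of request headers; each request picks one at random
HEADER_POOL = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15'},
]

def pick_headers():
    return random.choice(HEADER_POOL)

if __name__ == '__main__':
    # crawl the ten offsets concurrently instead of one after another
    with ThreadPoolExecutor(max_workers=5) as pool:
        list(pool.map(main, [i * 10 for i in range(10)]))   # list() surfaces any worker exceptions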
Oh my god hahaha, I'm going to have to study this properly.
This applied-maths major is already in tears.
You'll also want to build a UA pool and a proxy IP pool.
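A UA pool and a proxy IP pool work the same way: keep a list and pick a random entry per request. A minimal sketch, where the proxy addresses are placeholders rather than real endpoints:

import random
import requests

UA_POOL = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0',
]

PROXY_POOL = [
    'http://127.0.0.1:8080',   # placeholders; fill in real proxy addresses
    'http://127.0.0.1:8081',
]

def fetch(url):
    # rotate the User-Agent and the proxy on every request
    headers = {'User-Agent': random.choice(UA_POOL)}
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=10)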