思路:先组装需要访问的 URL 链接,然后用 asyncio 协程配合 aiohttp 批量发起请求,再用 BeautifulSoup 解析返回的响应数据,提取出我们想要的结果,最后把这些数据插入 MongoDB 数据库。
import asyncio
import time
import aiohttp
import async_timeout
from bs4 import BeautifulSoup
import pymongo
# Fetch page data concurrently (async) while crawling the target site.

# MongoDB handles: local server -> database "python" -> collection "movie".
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["python"]
mycol = mydb["movie"]

# Module-level accumulators; the __main__ section fills the link_* lists
# and linklist while it parses the downloaded pages.
temp_data = []
linklist = []
link_texts = []
link_nums = []
data = bytearray()

# Listing-page URL template; "{}" is replaced by the page number.
msg = "https://www.xxx.com/mdb/film/list/year-1908/o0d0p{}.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}

# Pages 1 through 6 of the listing.
urls = list(map(msg.format, range(1, 7)))
print(urls)  # show the generated URL list
async def fetch(session, url):
    """Download ``url`` through ``session`` and return ``(status, html)``.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with a ``status`` attribute and an awaitable
            ``text()`` method (an ``aiohttp.ClientSession`` in this script).
        url: Address of the page to request.

    Returns:
        A ``(status, html)`` tuple: the response status and the decoded body.

    Raises:
        asyncio.TimeoutError: If the request plus reading the body takes
            longer than 10 seconds.
    """
    async def _request():
        async with session.get(url) as response:
            # encoding=None leaves charset selection to the response object;
            # errors='ignore' drops bytes that cannot be decoded.
            html = await response.text(encoding=None, errors='ignore')
            return response.status, html

    # The synchronous ``with async_timeout.timeout(...)`` form is not
    # supported by current async-timeout releases; asyncio.wait_for enforces
    # the same 10 second limit with the standard library only.
    return await asyncio.wait_for(_request(), timeout=10)
async def main(url):
    """Fetch a single page and return its HTML.

    Args:
        url: Address of the page to download.

    Returns:
        The page body when the response status is 200, otherwise ``''``.
    """
    # Attach the module-level ``headers`` (browser User-Agent) to the session
    # so they are actually sent; they were defined but never passed on.
    async with aiohttp.ClientSession(headers=headers) as session:
        status, html = await fetch(session, url)
    return html if status == 200 else ''
async def _fetch_all(page_urls):
    """Run main() for every URL concurrently and return the results in order."""
    return await asyncio.gather(*(main(page_url) for page_url in page_urls))


if __name__ == '__main__':
    start = time.time()

    # Use an explicit loop instead of the deprecated implicit one returned by
    # asyncio.get_event_loop(), and always close it when the crawl is done.
    loop = asyncio.new_event_loop()
    try:
        # One entry per URL: the page HTML, or '' for a non-200 response.
        status_list = loop.run_until_complete(_fetch_all(urls))
    finally:
        loop.close()

    # Number of pages fetched (wrapping the list in another list always gave 1).
    print(len(status_list))

    for html in status_list:
        if not html:
            # main() returned '' for this page; nothing to parse.
            continue
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')
        movie_list = soup.find('ul', attrs={'class': 'inqList'})
        if movie_list is None:
            # Page without the expected listing element.
            continue
        for x in movie_list.find_all('li'):
            aTag = x.find('a')
            divTag = x.find('div')
            if aTag is None or divTag is None:
                # Not a complete entry; skip instead of raising AttributeError.
                continue
            link = aTag.get('href')
            divaTag = divTag.find('a')
            divbTag = divTag.find('b')
            link_text = divaTag.string if divaTag is not None else None
            if link_text:
                link_texts.append(link_text)
            if link:
                linklist.append(link)
            # Entries without a <b> element get 0 as their value.
            link_num = divbTag.string if divbTag is not None else 0
            link_nums.append(link_num)
            mydict = {
                "name": link_text,
                "socre": link_num,
                # A missing href would make the concatenation raise TypeError.
                "url": "https://www.1905.com" + link if link else None,
            }
            mycol.insert_one(mydict)  # store the record

    print(link_texts)
    print(linklist)
    print(link_nums)
    print(len(link_texts))
    print(len(linklist))
    print(len(link_nums))
    end = time.time()
    print("cost time:", end - start)


# Read back the data we inserted into MongoDB.
class MongoDb:
    """Small wrapper that opens a MongoDB connection and exposes one collection."""

    def __init__(self, uri="mongodb://localhost:27017/", db_name="python",
                 collection_name="movie"):
        """Connect to MongoDB and select a collection.

        Called without arguments it uses the local server, the "python"
        database and the "movie" collection.

        Args:
            uri: MongoDB connection string.
            db_name: Name of the database to use.
            collection_name: Name of the collection exposed as ``mycol``.
        """
        # Keep the client on the instance so callers can close it.
        self.client = pymongo.MongoClient(uri)
        # Collection handle used for queries.
        self.mycol = self.client[db_name][collection_name]
mdb = MongoDb()
# Return only name / socre / url for each stored document (no _id),
# sorted on the "socre" field in descending order.
_projection = {"_id": 0, "name": 1, "socre": 1, "url": 1}
_cursor = mdb.mycol.find({}, _projection).sort("socre", pymongo.DESCENDING)
for x in _cursor:
    print(x)

# Copyright notice: this is an original article by u012997396, released under
# the CC 4.0 BY-SA license. When reposting, include a link to the original
# source together with this notice.