Python异步高并发批量读取URL链接

   思路:先组装需要访问的 URL 链接,然后用 asyncio 协程通过 aiohttp 批量请求,把返回的 response 数据用 BeautifulSoup 解析得到想要的结果,最后把数据插入 MongoDB 数据库。

import asyncio
import time
import aiohttp
import async_timeout
from bs4 import BeautifulSoup
import pymongo

# Module-level MongoDB handles: database "python", collection "movie".
# NOTE(review): connection string is hard-coded; pymongo connects lazily,
# so this does not touch the network until the first operation.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb     = myclient["python"]
mycol    = mydb["movie"]

# Asynchronously fetch URL data in bulk (movie-site crawler).
# (The original line was a malformed bare string `''''…'''` intended as a
# comment; made an explicit comment here — a discarded bare string has no
# runtime effect either way.)

temp_data   = []  # unused scratch list (kept: module-level name)
linklist    = []  # collected detail-page hrefs
link_texts  = []  # collected movie titles
link_nums   = []  # collected scores (0 when the <b> tag is missing)

# Listing-page URL template; {} is the page number.
msg = "https://www.xxx.com/mdb/film/list/year-1908/o0d0p{}.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}

# Build the six listing-page URLs (pages 1..6).
urls = [msg.format(i) for i in range(1, 7)]
print(urls)  # show the generated URL list
data = bytearray()  # NOTE(review): never used anywhere in this file

async def fetch(session, url):
    """GET *url* with the given aiohttp *session*.

    Returns:
        tuple: (HTTP status code, decoded response body). Decoding errors
        are ignored so partially mis-encoded pages can still be parsed.
    """
    # BUG FIX: async_timeout.timeout must be entered with `async with`;
    # the original plain `with` raises at runtime on async_timeout >= 4.0.
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            html = await response.text(encoding=None, errors='ignore')
            return response.status, html

async def main(url):
    """Download one page and return its HTML, or '' on a non-200 status."""
    # A fresh session per URL matches the original behavior.
    async with aiohttp.ClientSession() as session:
        status, html = await fetch(session, url)
        return html if status == 200 else ''

if __name__ == '__main__':
    start = time.time()

    async def _crawl_all():
        # Fan out one coroutine per listing page and collect the HTML bodies
        # in URL order.
        return await asyncio.gather(*(main(url) for url in urls))

    # asyncio.run replaces the deprecated get_event_loop()/run_until_complete.
    status_list = asyncio.run(_crawl_all())

    # BUG FIX: the original printed len([status_list]), which is always 1.
    print(len(status_list))

    for html in status_list:
        if not html:
            continue  # main() returns '' for non-200 responses — nothing to parse
        # Explicit parser keeps results consistent and silences the bs4 warning.
        soup = BeautifulSoup(html, 'html.parser')
        inq_list = soup.find('ul', attrs={'class': 'inqList'})
        if inq_list is None:
            continue  # layout changed or empty page: skip instead of crashing
        for x in inq_list.find_all('li'):
            aTag      = x.find('a')
            divTag    = x.find('div')
            link      = aTag.get('href')
            divaTag   = divTag.find('a')
            divbTag   = divTag.find('b')
            link_text = divaTag.string

            if link_text:
                link_texts.append(link_text)
            if link:
                linklist.append(link)
            # Score lives in an optional <b>; default to 0 when absent.
            if divbTag:
                link_num = divbTag.string
            else:
                link_num = 0
            link_nums.append(link_num)
            # NOTE(review): "socre" is a typo for "score", but the read query
            # later in this file sorts on the same key, so it is kept as-is.
            mydict = {"name": link_text, "socre": link_num, "url": "https://www.1905.com" + link}
            mycol.insert_one(mydict)  # persist one movie document

    print(link_texts)
    print(linklist)
    print(link_nums)

    print(len(link_texts))
    print(len(linklist))
    print(len(link_nums))
    end = time.time()
    print("cost time:", end - start)

读取MongoDb里面我们插入的数据

class MongoDb:
    """Holds a handle to the 'movie' collection of the local 'python' DB.

    BUG FIX: the original `class` line contained invisible zero-width
    characters pasted from the web, which is a SyntaxError in real source.
    """

    def __init__(self):
        # pymongo connects lazily; no I/O happens until the first query.
        myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        mydb = myclient["python"]
        # Public handle used by callers for find()/insert operations.
        self.mycol = mydb["movie"]

# Read back the inserted documents, highest score first.
# ("socre" is the stored spelling — it must match the key used at insert time.)
mdb = MongoDb()
projection = {"_id": 0, "name": 1, "socre": 1, "url": 1}
for doc in mdb.mycol.find({}, projection).sort("socre", -1):
    print(doc)


版权声明:本文为u012997396原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。