python爬取豆瓣top250代码优化版

直接上代码,这次把代码完整地优化了一遍。

import re
import urllib.request
from bs4 import BeautifulSoup

def deal(list):
    """Clean a list of strings by running both cleanup passes in order.

    The strings first go through ``replacebilank`` and the result is then
    passed to ``replacestr``; the final list is returned.
    """
    return replacestr(replacebilank(list))
def replacebilank(list):  # strip spaces
    """Return a new list in which every space is removed from each string."""
    return [text.replace(" ", "") for text in list]
def replacestr(list):  # strip leftover junk characters
    """Return a new list with non-content filler removed from each string.

    The previous body was identical to ``replacebilank`` and therefore
    removed nothing beyond plain spaces. The scraped text is cut out of raw
    HTML, where padding shows up as the literal entity ``&nbsp;`` (or as the
    decoded U+00A0 character), so those are stripped here as well. Plain
    spaces are still removed, which keeps the earlier behaviour intact.

    Args:
        list: Iterable of strings to clean (name kept for compatibility).

    Returns:
        A new list of cleaned strings; the input is not modified.
    """
    junk_tokens = ("&nbsp;", "\xa0", " ")
    cleaned = []
    for text in list:
        for token in junk_tokens:
            text = text.replace(token, "")
        cleaned.append(text)
    return cleaned


def gethtml(pages=10):  # fetch the html
    """Download the Douban Top 250 listing and return all pages as one string.

    The previous version did ``html += html`` inside the loop, which doubled
    the current page and was then overwritten on the next iteration, so only
    the last page (twice) was ever returned. Pages are now collected and
    joined so the result contains every requested page exactly once.

    Args:
        pages: Number of listing pages to request. Page ``i`` is requested
            with ``start=i*25``. The default of 10 matches the original loop.

    Returns:
        The UTF-8 decoded bodies of all requested pages, concatenated in
        request order. An empty string when ``pages`` is 0.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when a request
            fails.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
    }
    chunks = []
    for i in range(pages):
        url = "https://movie.douban.com/top250?start=" + str(i * 25)
        req = urllib.request.Request(url=url, headers=headers)
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(req) as response:
            chunks.append(response.read().decode("utf-8"))
    return "".join(chunks)
# NOTE(review): the return value is discarded here, so this call only performs
# the downloads; it looks like a leftover debug call -- confirm before removing.
gethtml()
def required_compile(str, html=None):  # return the regex matches
    """Return the cleaned matches of a regular expression in the listing HTML.

    Args:
        str: Regular-expression pattern to search for. The name shadows the
            builtin but is kept so existing callers keep working.
        html: Text to search. When omitted, the pages are downloaded with
            ``gethtml()``; pass it explicitly to reuse one download for
            several patterns.

    Returns:
        The ``re.findall`` results after being cleaned by ``deal``.
    """
    if html is None:
        html = gethtml()
    return deal(re.findall(str, html))


def required_deta():  # return the collected records
    """Extract one record per listing entry, print the records and return them.

    The HTML is downloaded once and shared by all four patterns; previously
    every pattern triggered its own full download. The result is also
    returned now (it was only printed before), so callers can use it.

    Returns:
        A list of ``[name, image, other, info]`` lists built from the four
        match lists in parallel. Rows stop at the shortest match list.
    """
    html = gethtml()

    listname = required_compile(r'alt="(.*?)" src="', html)  # alt attribute text
    listjpg = required_compile(r'src="(.*?)" class=', html)  # src attribute value
    # Text that follows the "&nbsp;/&nbsp;" prefix inside class="other".
    listreword = required_compile(r'class="other">&nbsp;/&nbsp;(.*?)<', html)
    # Indented text ending in "...<br>" -- presumably the director/cast line.
    listgaint = required_compile(r'    (.*)...<br>', html)

    required_details = [
        [name, jpg, reword, gaint]
        for name, jpg, reword, gaint in zip(listname, listjpg, listreword, listgaint)
    ]
    print(required_details)
    return required_details
# Run the scrape at module level; required_deta() prints the collected rows.
required_deta()

这下代码清爽多了吧?想要提取什么内容,在 required_deta() 函数里添加对应的正则表达式就行了,快去试试吧!


版权声明:本文为qq_53819205原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。