直接上代码,这次把代码完全优化好了。
import re
import urllib.request
from bs4 import BeautifulSoup
def deal(list):
    """Clean every string in ``list`` and return the cleaned copies.

    The strings are passed through ``replacebilank`` first and the result
    is then passed through ``replacestr``.
    """
    return replacestr(replacebilank(list))
def replacebilank(list):  # remove spaces
    """Return a new list where each string has its space characters removed.

    The input list is not modified.
    """
    return [text.replace(" ", "") for text in list]
def replacestr(list):  # remove invalid characters
    """Return a new list where filler characters are stripped from each string.

    Removes ordinary spaces, no-break spaces (U+00A0) and the literal
    ``&nbsp;`` HTML entity. The input list is not modified.

    Args:
        list: Iterable of strings to clean.

    Returns:
        list: The cleaned strings, in the same order.
    """
    # NOTE(review): the previous body only removed " ", which made it an exact
    # duplicate of replacebilank. The no-break space and "&nbsp;" targets are
    # presumed to be what "invalid characters" meant -- confirm.
    unwanted = (" ", "\xa0", "&nbsp;")
    cleaned = []
    for text in list:
        for junk in unwanted:
            text = text.replace(junk, "")
        cleaned.append(text)
    return cleaned
def gethtml():  # fetch the html
    """Download the ten Douban Top 250 list pages and return them as one string.

    The ``start`` query parameter advances in steps of 25 (0, 25, ..., 225).
    The decoded pages are concatenated in request order.

    Returns:
        str: The UTF-8 decoded HTML of all ten pages joined together.

    Raises:
        urllib.error.URLError: If a request fails.
    """
    # A browser-like User-Agent header is sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
    }
    # Collect each page separately: the previous code overwrote ``html`` on
    # every iteration and then doubled it, so earlier pages were lost.
    pages = []
    for i in range(10):
        url = "https://movie.douban.com/top250?start=" + str(i * 25)
        req = urllib.request.Request(url=url, headers=headers)
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(req) as response:
            pages.append(response.read().decode("utf-8"))
    return "".join(pages)
# Fetch the pages once at load time; the returned HTML is not kept.
# NOTE(review): the result is discarded here -- confirm this call is still wanted.
gethtml()
def required_compile(str, html=None):  # return the regex matches
    """Return the cleaned matches of a regular expression in the scraped HTML.

    Args:
        str: Regular-expression pattern. The name shadows the builtin but is
            kept so existing callers continue to work. Each match is cleaned
            with string methods, so the pattern should yield string matches
            (at most one capture group).
        html: Optional HTML text to search. When omitted, the pages are
            downloaded with ``gethtml()``; pass it to reuse HTML that has
            already been fetched.

    Returns:
        list: Every ``findall`` match, passed through ``deal()``.
    """
    if html is None:
        html = gethtml()
    matches = re.compile(str).findall(html)
    return deal(matches)
def required_deta():  # return the collected records
    """Extract four fields per entry, print the combined rows and return them.

    The four patterns capture, in order: the text of an ``alt`` attribute
    followed by ``src``, the value of a ``src`` attribute followed by
    ``class``, the text after ``class="other"> / ``, and the text on a line
    that ends just before ``<br>``.

    Returns:
        list: One ``[alt, src, other, line]`` list per entry. The same list
        is also printed.
    """
    listname = required_compile(r'alt="(.*?)" src="')
    listjpg = required_compile(r'src="(.*?)" class=')
    listreword = required_compile(r'class="other"> / (.*?)<')
    listgaint = required_compile(r' (.*)...<br>')
    # zip stops at the shortest list, so a pattern that matched fewer times
    # than the others can no longer raise IndexError. Rows are paired purely
    # by position.
    required_details = [
        [name, jpg, reword, gaint]
        for name, jpg, reword, gaint in zip(listname, listjpg, listreword, listgaint)
    ]
    print(required_details)
    return required_details
# Script entry point: run the scrape and print the collected rows.
required_deta()
这下代码爽多了吧?想要什么内容,在 required_deta() 函数里面添加对应的正则表达式就行了,快去试试吧!
版权声明:本文为qq_53819205原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。