Requirement: scrape the publicly released housing-transaction filing data from a specified web page.
Requirements analysis:
1. The data URL on the target site never changes, so there is no need to build query params.
2. The target site has no anti-scraping measures, no robots.txt, and no cookie validation; it does not even require a request header. The server is, however, slow to respond.
3. The site's markup is fairly well-formed; judging from the comments in the source, it looks like it was copied straight from a template.
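A quick probe bears this out (a minimal sketch for illustration only; it is not part of the final script): a bare GET with no headers and no cookies succeeds, as long as the timeout allows for the slow response.

import requests

url = "http://123.7.16.67:88/WebIssue/ExternalServer/Samples/price.asp?QueryItem=%BD%F1%C8%D5%D7%A1%D5%AC%BE%F9%BC%DB"
# No headers, no cookies; only a generous timeout is needed.
r = requests.get(url, timeout=30)
print(r.status_code)        # expect 200
print(r.apparent_encoding)  # the page is served in a legacy GBK-family encoding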
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 15 23:32:02 2019
@author: 张玄瑾
"""
import requests
from lxml import etree
import os
import pandas as pd
import time
# Work in the directory where Result.csv is kept
os.chdir('C:\\Users\\张玄瑾\\Desktop\\房管局数据\\')
# QueryItem is the GBK percent-encoding of "今日住宅均价" (today's average residential price)
url = "http://123.7.16.67:88/WebIssue/ExternalServer/Samples/price.asp?QueryItem=%BD%F1%C8%D5%D7%A1%D5%AC%BE%F9%BC%DB"
# Not strictly required by the site (see the analysis above), but sent anyway
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
def requestinfo(url):
    """Fetch the page and return its text, or None on failure."""
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # decode with the detected (GBK-family) encoding
        return r.text
    except requests.RequestException:
        print('wrong in requests')
        return None
def chooseinfo(text):
    """Parse the data table, skipping the header row, into a DataFrame."""
    html = etree.HTML(text)
    rows = []
    for tr in html.xpath("//tr[position()>1]"):  # every row after the header
        temp = tr.xpath(".//td/text()")
        rows.append({'采集时间': time.ctime(),   # timestamp of this collection run
                     '区域': temp[0],
                     '名称': temp[1],
                     '成交数量': temp[2],
                     '成交面积合计': temp[3],
                     '每平米均价': temp[4]})
    # Build the frame in one pass; DataFrame.append was removed in pandas 2.0
    return pd.DataFrame(rows, columns=['采集时间', '区域', '名称',
                                       '成交数量', '成交面积合计', '每平米均价'])
def checkcsv(df1):
    """Append to Result.csv if it already exists, otherwise create it."""
    # Check for the file itself, not merely a non-empty directory
    if os.path.exists('Result.csv'):
        tempdf = pd.read_csv('Result.csv', index_col=0)
        newdf = pd.concat([tempdf, df1], ignore_index=True)
        newdf.to_csv('Result.csv', encoding='utf-8')
        print('存储数据完成')            # "data stored"
    else:
        df1.to_csv('Result.csv', encoding='utf-8')
        print('文件创建成功,数据已经记录')  # "file created; data recorded"
def main():
    r = requestinfo(url)
    if r is None:  # request failed; nothing to parse or store
        return
    df1 = chooseinfo(r)
    checkcsv(df1)

if __name__ == '__main__':
    main()
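Each run appends one timestamped snapshot per district to Result.csv. To inspect the accumulated history (a minimal sketch; it assumes the file was written by checkcsv above):

import pandas as pd

df = pd.read_csv('Result.csv', index_col=0)
print(df.tail())                 # the most recent rows
print(df['采集时间'].nunique())  # how many collection runs have been recorded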