爬取某市房管局房屋交易备案公开数据

需求:爬取指定网页上的房屋交易备案公开数据
需求分析:
1.目标网站中的数据地址不发生改变,不需要写params
2.目标网站没有反爬虫机制,也没有robots.txt协议,更无cookie验证,甚至不需要写请求头(下面的代码仍然带上了User-Agent),但服务器响应时间较长。
3.该网站代码书写较为规范,从源码注释里看出来像是直接复制的模板。

# -*- coding: utf-8 -*-
"""
Created on Sun Sep 15 23:32:02 2019

@author: 张玄瑾
"""

import requests
from lxml import etree
import os
import pandas as pd
import time

# Switch to the output folder; the relative 'Result.csv' path used by
# checkcsv() is resolved against this working directory.
# NOTE(review): hard-coded, machine-specific Windows path; os.chdir raises
# if the folder does not exist.
os.chdir('C:\\Users\\张玄瑾\\Desktop\\房管局数据\\')
# Page to scrape. The QueryItem value is percent-encoded; it looks like the
# GBK bytes for "今日住宅均价" (today's average residential price) — TODO confirm.
url = "http://123.7.16.67:88/WebIssue/ExternalServer/Samples/price.asp?QueryItem=%BD%F1%C8%D5%D7%A1%D5%AC%BE%F9%BC%DB"
# Browser-like User-Agent header sent by requestinfo() with each request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}


def requestinfo(url):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``headers`` and a 30-second
    timeout. The body is decoded with the encoding detected from the
    response content (``apparent_encoding``) instead of the declared one.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the sentinel string
        ``'wrong in requests'`` when the request fails (connection error,
        timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures become the sentinel; programming
        # errors and KeyboardInterrupt are no longer silently swallowed.
        return 'wrong in requests'
    r.encoding = r.apparent_encoding
    return r.text

def chooseinfo(text):
    """Parse the page HTML into a DataFrame of transaction rows.

    Every ``<tr>`` that is not the first row of its parent is read
    (the first row is presumably the table header). The first five text
    nodes found in the row's ``<td>`` cells fill the data columns, and the
    collection time is recorded alongside them.

    Args:
        text: HTML source of the page.

    Returns:
        A DataFrame with the columns 采集时间, 区域, 名称, 成交数量,
        成交面积合计 and 每平米均价, one row per parsed table row and a
        0..n-1 index. It is empty when no usable row is found.
    """
    columns = ['采集时间', '区域', '名称', '成交数量', '成交面积合计', '每平米均价']
    html = etree.HTML(text)
    if html is None:
        # Nothing could be parsed (e.g. empty input): return an empty table.
        return pd.DataFrame(columns=columns)

    # One timestamp for the whole batch, so all rows of a scrape agree.
    collected_at = time.ctime()
    records = []
    for row in html.xpath("//tr[position()>1]"):
        cells = row.xpath(".//td/text()")
        if len(cells) < 5:
            # Rows without five text cells (spacers, footers, ...) cannot
            # fill every column; skip them instead of raising IndexError.
            continue
        records.append([collected_at, *cells[:5]])

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copies the whole frame on every iteration.
    return pd.DataFrame(records, columns=columns)
    
def checkcsv(df1, filename='Result.csv'):
    """Store *df1* in the results CSV, appending to it when it exists.

    If *filename* already exists it is read back (first column as index),
    *df1* is concatenated below the stored rows with a fresh 0..n-1 index,
    and the file is rewritten. Otherwise *df1* is written as a new file.

    Args:
        df1: DataFrame holding the newly collected rows.
        filename: CSV path, relative to the current working directory
            unless absolute. Defaults to ``'Result.csv'``.
    """
    # Test for the target file itself; a non-empty directory does not
    # imply that the CSV is already there.
    if os.path.isfile(filename):
        existing = pd.read_csv(filename, index_col=0)
        combined = pd.concat([existing, df1], ignore_index=True)
        combined.to_csv(filename, encoding='Utf-8')
        print('存储数据完成')
    else:
        df1.to_csv(filename, encoding='Utf-8')
        print('文件创建成功,数据已经记录')
        

def main():
    """Run one scrape: download the page, parse it, and store the rows."""
    # ``url`` is only read here, so no ``global`` declaration is needed.
    page_text = requestinfo(url)
    records = chooseinfo(page_text)
    checkcsv(records)


# Run the scrape only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

版权声明:本文为yz7zzxj001原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。