以前曾想爬取一些汽车品牌的官网,了解它们在中国的门店数量和分布情况。这个网站之前爬过一次,但没有成功;最近重新尝试,终于成功了。该网站上我需要的数据主要以 JSON 形式返回:需要先获取省份 id,用它拼接链接获取城市 id,再用城市 id 拼接链接获取我想要的经销商信息。之后会基于这些数据做一个可视化。
import requests
import lxml
from lxml import etree
import json
import numpy as np
# Request headers passed to the requests.get calls below. The User-Agent
# value is a mobile Chrome (Android, Nexus 5) browser string.
headers = {
"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36"
}
# Fetch province ids
def get_province_Num():
    """Fetch the province list and return it as ``{province_id: province_name}``.

    The endpoint is requested with ``callback=p11``, so the body is expected
    to be JSONP-wrapped. Instead of slicing a fixed number of characters off
    each end and ``eval``-ing the rest, the text between the outermost
    parentheses is parsed as JSON. This avoids executing remote text and
    gives the same result whether the payload holds one entry or many.

    Returns:
        dict: Province id mapped to province name, taken from the ``id`` and
        ``name`` keys of each entry. Empty when the response carries no
        payload.

    Raises:
        ValueError: If the text inside the wrapper is not valid JSON.
    """
    url = "http://api.faw-benteng.com/ajax/v1_getprov.php?callback=p11"
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    text = response.content.decode('utf-8')

    # Keep only what sits between the first "(" and the last ")".
    start, end = text.find('('), text.rfind(')')
    payload = text[start + 1:end].strip() if 0 <= start < end else ''
    provinces = json.loads(payload) if payload else []

    # A lone object is treated as a one-element list.
    if isinstance(provinces, dict):
        provinces = [provinces]

    return {item['id']: item['name'] for item in provinces or []}
# Fetch city ids
def getCityNum(url, provname):
    """Fetch the cities of one province and map city id to city name.

    The text between the outermost parentheses of the (presumably
    JSONP-wrapped) response is parsed as JSON and normalised to a list, so
    one city and many cities go through the same code path. The payload
    length is no longer used to guess its shape.

    Args:
        url: City-list URL for the province, e.g.
            ``http://api.faw-benteng.com/ajax/v1_getcity.php?callback=p12&prov=11``.
        provname: Province name; handed back unchanged as the first element
            of the result.

    Returns:
        ``(provname, {city_id: city_name})`` when at least one city is
        listed; otherwise the message string ``"该网址没有经销店"``. Callers
        should check that the result is a tuple before unpacking it.

    Raises:
        ValueError: If the text inside the wrapper is not valid JSON.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    text = response.content.decode('utf-8')

    # Keep only what sits between the first "(" and the last ")".
    start, end = text.find('('), text.rfind(')')
    payload = text[start + 1:end].strip() if 0 <= start < end else ''
    cities = json.loads(payload) if payload else []

    # A lone object is treated as a one-element list.
    if isinstance(cities, dict):
        cities = [cities]

    if not cities:
        return ("该网址没有经销店")

    return provname, {city['id']: city['name'] for city in cities}
# Fetch the dealer fields we need
def getcarurl(pageprov, cityname, url):
    """Yield one record per dealer returned by a city's dealer endpoint.

    Args:
        pageprov: Province name, copied into every record.
        cityname: City name, copied into every record.
        url: Dealer-list URL for the city.

    Yields:
        dict: A record with the keys ``pageprov``, ``cityname``, ``attr``,
        ``biz_name``, ``sale_phone``, ``serv_phone``, ``address`` and
        ``zip``.

    Nothing is yielded when the HTTP status is not 200 or when the decoded
    JSON is neither an object nor an array.

    Raises:
        ValueError: If the response body is not valid JSON.
        KeyError: If a dealer entry lacks one of the expected fields.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        return

    data = json.loads(response.content.decode('utf-8'))

    # Dealers normally arrive as the values of a JSON object. A JSON array
    # (for example an empty one) is iterated directly instead of failing on
    # the missing .values() method.
    if isinstance(data, dict):
        dealers = data.values()
    elif isinstance(data, list):
        dealers = data
    else:
        return

    for item in dealers:
        yield {
            'pageprov': pageprov,
            'cityname': cityname,
            'attr': item['attr'],
            'biz_name': item['biz_name'],
            'sale_phone': item['sale_phone'],
            'serv_phone': item['serv_phone'],
            'address': item['address'],
            'zip': item['zip'],
        }
# Save
def save_to_txt():
    """Append the current city's dealers to ``奔腾经销商.txt``, one per line.

    Takes no arguments: it reads the module-level names ``pageprov``,
    ``cityname`` and ``url``, which the caller must set before each call,
    and passes them to ``getcarurl``.

    Each line holds, comma-separated: province, city, ``attr``,
    ``biz_name``, ``sale_phone``, ``serv_phone``, ``address``, ``zip``.
    The row is built with ``','.join`` so every field gets a separator;
    string concatenation had dropped the comma between ``serv_phone`` and
    ``address``. Values are converted with ``str`` so a non-string field no
    longer raises ``TypeError``; ``None`` is written as an empty field.
    """
    fields = ('attr', 'biz_name', 'sale_phone', 'serv_phone', 'address', 'zip')
    with open('奔腾经销商.txt', 'a+', encoding='utf-8', errors='ignore') as f:
        for item in getcarurl(pageprov, cityname, url):
            print(item)
            row = [pageprov, cityname] + [item[key] for key in fields]
            f.write(','.join('' if value is None else str(value) for value in row) + '\n')
if __name__ == '__main__':
    # Province id -> province name. Kept under its own name so that
    # ``pageprov`` below always means "current province name".
    provinces = get_province_Num()
    for provid, provname in provinces.items():
        url = "http://api.faw-benteng.com/ajax/v1_getcity.php?callback=p12&prov=%d" % int(provid)
        result = getCityNum(url, provname)
        # getCityNum returns a (province_name, cities) tuple on success and
        # a non-tuple value (None or a message string) when there is
        # nothing to crawl; skip those provinces instead of crashing.
        if not isinstance(result, tuple):
            continue
        pageprov, city = result
        for cityid, cityname in city.items():
            url = "http://api.faw-benteng.com/ajax/v1_getdealer.php?brand=benteng&city=%d" % int(cityid)
            # save_to_txt() takes no arguments: it reads the module-level
            # pageprov, cityname and url set just above and fetches the
            # dealers itself, so no separate getcarurl() call is needed.
            save_to_txt()

# Copyright notice: this is an original article by GZ_Wiilian, released under
# the CC 4.0 BY-SA license. When reposting, please include a link to the
# original source and this notice.