爬虫思路:
1.确定url
2.发送请求 requests
3.解析数据
4.保存数据(本地)
关键库:requests,re,csv,pprint(用于在console查看数据)
一.利用lagou的一个接口获取全国城市。(共318个,好像不全不过仅供学习参考)
def getcitys():
    """Fetch all city names from Lagou's city-search endpoint.

    Returns:
        list[str]: city names pulled out of the JSON payload
        (~318 cities; the endpoint may not be exhaustive).
    """
    getcityurl = 'https://www.lagou.com/lbs/getAllCitySearchLabels.json'
    headers = {'cookie': 'JSESSIONID=ABAAABAABEIABCI2827CF8DDD33BE694A53A588393F69EF',
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
               }
    recity = requests.get(url=getcityurl, headers=headers)
    # Regex over the raw response text avoids walking the nested JSON
    # structure; every "name" field in this payload is a city name.
    # findall already returns a list, so no element-by-element copy is needed.
    return re.findall(r'"name":"(.*?)"', recity.text)
二.利用KFC官网的一个接口获取全国城市KFC门店信息。
def getkfc(city):
    """Query KFC's official store-list API for every store in *city*
    and append the results to data.csv.

    Args:
        city (str): city name, used as the search keyword.
    """
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    payload = {'cname': '',
               'pid': '',
               'keyword': city,
               'pageIndex': '1',
               'pageSize': '10000'}  # oversized page so one request covers the whole city
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    # 请求数据 (request the data)
    response = requests.post(url=base_url, data=payload, headers=headers)
    # Bind to `result`, not `json`, so the stdlib json module is not shadowed.
    result = response.json()
    # 解析数据 (parse the data) — a city with no stores may omit 'Table1',
    # so fall back to an empty list instead of raising KeyError.
    list_data = result.get('Table1', [])
    # 保存数据 (save the data): open the CSV once per city, not once per row.
    with open('data.csv', mode='a', newline='', encoding='utf-8') as csvfile:
        # 记录当前城市 (record the current city)
        csvfile.write("当前城市为:{}".format(city))
        csvfile.write('\n')
        csv_writer = csv.writer(csvfile, delimiter=',')
        for store in list_data:  # `store`, not `data`, avoids shadowing the request body
            rnumb = store['rownum']
            storeName = store['storeName']
            cityName = store['cityName']
            addressDetail = store['addressDetail']
            pro = store['pro']
            provinceName = store['provinceName']
            print(rnumb, provinceName, cityName, storeName, addressDetail, pro)
            csv_writer.writerow([rnumb, provinceName, cityName, storeName, addressDetail, pro])
三.main函数
if __name__ == '__main__':
    # Entry point: fetch every city from Lagou, then scrape each
    # city's KFC stores one by one.
    all_citys = getcitys()
    for current_city in all_citys:
        print("******当前城市为:{}******".format(current_city))
        getkfc(current_city)
无proxies,无proxies,无proxies?,效率很慢。。。。。
版权声明:本文为qq_40375113原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。