程序运行截图:

MySQL 代码:
-- Table that stores one row per crawled government-procurement contract notice.
-- Column meanings below are taken from the insert statement in the Python
-- crawler that follows this snippet.
create table htgs(
-- Primary key; the crawler fills it with a random integer.
id int primary key,
-- Purchasing unit.
cgdw varchar(2000),
-- Name of the purchased item.
cgmc varchar(2000),
-- Winning bidder.
zbdw varchar(2000),
-- Contract number; the crawler also uses it to detect already-stored rows.
htid varchar(2000),
-- Contract amount (yuan), stored as text.
htvalue varchar(2000),
-- Bank of the winning supplier's account.
zbgyskhbank varchar(2000),
-- Account number of the winning supplier.
zbgyskhzh varchar(2000),
-- URL of the contract notice.
hturl varchar(2000),
-- Publication time, stored as text exactly as scraped.
fbtime varchar(200)
);

Python 代码:
# 2019/7/5
import json
import random
import re
from urllib import parse
import requests
import pymysql
# Open the MySQL connection shared by the whole script.
# NOTE(review): MySQL normally listens on 3306; port 8080 is kept exactly as
# the original wrote it -- confirm it matches the local server.
db = pymysql.connect(
    host='localhost',
    port=8080,
    user='root',
    passwd='123',
    db='students',
    charset='utf8',
)

# Single cursor reused for every query issued by the crawler.
cursor = db.cursor()

# Set to True by get_data() once it meets a contract number that is already
# stored; intended for the (currently commented-out) incremental-crawl mode.
over = False
# 通过地址获取数据
# Listing page template; %i is the 1-based page number.
_LISTING_URL = "http://www.ccgp-jiangxi.gov.cn/web/jyxx/002006/002006006/%i.html"

# Site root that the relative notice links on a listing page are joined to.
_SITE_ROOT = "http://www.ccgp-jiangxi.gov.cn"

# JSON endpoint that returns the contract fields; %s is the notice's rowguid.
_DETAIL_URL = "http://ggzyjy.jiangxi.gov.cn/hygs/huiyuaninfo/pages/htgs/jxpHtgsDetailAction.action?cmd=page_Load&rowguid=%s&isCommondto=true"

# Placeholder stored in the database when a field is missing.
_BLANK = "空"

# Headers sent when fetching a notice page.
# NOTE(review): "Host" names ggzyjy.jiangxi.gov.cn although the request goes
# to www.ccgp-jiangxi.gov.cn, and the cookie is a captured session value that
# has presumably expired -- both kept as in the original; confirm.
_NOTICE_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": "JSESSIONID=FBCF1C3CA8814ACADA7E15E1BEB057A4; _CSRFCOOKIE=C7F99429DFB7175C79CB1F1ADE037756C2CFF787; EPTOKEN=C7F99429DFB7175C79CB1F1ADE037756C2CFF787",
    "Host": "ggzyjy.jiangxi.gov.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0",
}

# Headers for the detail POST; "Referer" is added per notice inside get_data().
_DETAIL_HEADERS = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Connection": "keep-alive",
    "Content-Length": "1640",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": "JSESSIONID=5B4CF658C132C1165ECFD6478FB4E88F; _CSRFCOOKIE=9D55488F828C0C4B8924A1E5BF41CE02710021DD; EPTOKEN=9D55488F828C0C4B8924A1E5BF41CE02710021DD",
    "CSRFCOOKIE": "9D55488F828C0C4B8924A1E5BF41CE02710021DD",
    "Host": "ggzyjy.jiangxi.gov.cn",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0",
    "X-Requested-With": "XMLHttpRequest",
    "Cache-Control": "max-age=0",
}

# URL-encoded form body of the detail POST. It lists the controls the server
# should fill in, in this order: jianshedw, zbdanweiname, htno, htjine,
# supplierbankname, supplierbankaccount, datagrid, ... -- get_data() reads the
# response's "controls" list by these positions (0..6).
_DETAIL_FORM_BODY = "commonDto=%5B%7B%22id%22%3A%22jianshedw%22%2C%22bind%22%3A%22dataBean.jianshedw%22%2C%22type%22%3A%22outputtext%22%7D%2C%7B%22id%22%3A%22zbdanweiname%22%2C%22bind%22%3A%22dataBean.zbdanweiname%22%2C%22type%22%3A%22outputtext%22%7D%2C%7B%22id%22%3A%22htno%22%2C%22bind%22%3A%22dataBean.htno%22%2C%22type%22%3A%22outputtext%22%7D%2C%7B%22id%22%3A%22htjine%22%2C%22bind%22%3A%22dataBean.htjine%22%2C%22type%22%3A%22outputtext%22%7D%2C%7B%22id%22%3A%22supplierbankname%22%2C%22bind%22%3A%22dataBean.supplierbankname%22%2C%22type%22%3A%22outputtext%22%7D%2C%7B%22id%22%3A%22supplierbankaccount%22%2C%22bind%22%3A%22dataBean.supplierbankaccount%22%2C%22type%22%3A%22outputtext%22%7D%2C%7B%22id%22%3A%22datagrid%22%2C%22type%22%3A%22datagrid%22%2C%22action%22%3A%22zbdwSingleModel%22%2C%22idField%22%3A%22rowguid%22%2C%22pageIndex%22%3A0%2C%22pageSize%22%3A10%2C%22sortField%22%3A%22%22%2C%22sortOrder%22%3A%22%22%2C%22columns%22%3A%5B%7B%22fieldName%22%3A%22itemno%22%7D%2C%7B%22fieldName%22%3A%22itemname%22%7D%2C%7B%22fieldName%22%3A%22totalprice%22%7D%5D%2C%22url%22%3A%22jxpHtgsDetailAction.action%3Fcmd%3DzbdwSingleModel%22%2C%22data%22%3A%5B%5D%7D%2C%7B%22id%22%3A%22datagrid2%22%2C%22type%22%3A%22datagrid%22%2C%22action%22%3A%22defaultModel%22%2C%22idField%22%3A%22rowguid%22%2C%22pageIndex%22%3A0%2C%22pageSize%22%3A30%2C%22sortField%22%3A%22%22%2C%22sortOrder%22%3A%22%22%2C%22columns%22%3A%5B%7B%22fieldName%22%3A%22attachfilename%22%7D%5D%2C%22url%22%3A%22jxpHtgsDetailAction.action%3Fcmd%3DdefaultModel%22%2C%22data%22%3A%5B%5D%7D%2C%7B%22id%22%3A%22_common_hidden_viewdata%22%2C%22type%22%3A%22hidden%22%2C%22value%22%3A%22%22%7D%5D"

_INSERT_SQL = "insert into htgs(id,cgdw,cgmc,zbdw,htid,htvalue,zbgyskhbank,zbgyskhzh,hturl,fbtime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"


def _send_with_retry(send, url, **kwargs):
    """Call ``send(url, **kwargs)``, retrying once after a ConnectionError.

    ``send`` is ``requests.get`` or ``requests.post``. A failure of the retry
    propagates to the caller, as it did in the original inline try/except.
    """
    try:
        return send(url, **kwargs)
    except requests.exceptions.ConnectionError:
        return send(url, **kwargs)


def _text_or_blank(value, blank_max_len):
    """Return ``value``, or the "空" placeholder if it has at most ``blank_max_len`` characters."""
    return _BLANK if len(value) <= blank_max_len else value


def get_data(url):
    """Crawl one listing page of contract notices and store each contract in ``htgs``.

    For every notice linked from the listing page this fetches the notice page,
    extracts the detail URL from its inline script, POSTs to the detail
    endpoint, and inserts the returned fields through the module-level
    ``cursor``/``db``. A notice whose contract number is already stored is
    skipped and the module-level ``over`` flag is set.

    Args:
        url: 1-based number of the listing page (despite the name, not a URL).

    Raises:
        requests.exceptions.ConnectionError: if a request fails twice in a row.
        KeyError, IndexError, ValueError: if a detail URL or the detail JSON
            does not have the expected shape (not handled, as in the original).
    """
    global over

    respose = _send_with_retry(requests.get, _LISTING_URL % url)
    print("*" * 300)
    print("开始爬取第%i页的政府采购合同公示数据!" % url)

    # Relative links of the notices, and the publication dates shown beside them.
    contents = re.findall(
        r'<li class="ewb-list-node clearfix">.*?<a href="(.*?)" target="_blank" class="ewb-list-name">',
        respose.text, re.S)
    times = re.findall(
        r'<span class="ewb-list-date">(.*?)</span>',
        respose.text, re.S)

    print("合同公示数量:%s" % len(contents) + "条")
    # An empty page has no dates; the original crashed here with IndexError.
    if times:
        print("发布时间:%s" % str(times[0]))
    print("*" * 300)

    for temp, href in enumerate(contents):
        notice = _send_with_retry(
            requests.get, _SITE_ROOT + href, headers=_NOTICE_HEADERS)
        data = re.findall(
            r'<div class="con" style="margin-top: 31px;"><script>(.*?)</script>',
            notice.text, re.S)
        try:
            # The inline script assigns the detail URL as a single-quoted
            # string: take the text between ``='`` and the next ``'``.
            htgsdz = data[0].split("='")[1].split("'")[0]
        except IndexError:
            # No detail URL in this notice. Skip only this notice; the
            # original used ``return`` and dropped the rest of the page.
            continue

        detail_headers = dict(_DETAIL_HEADERS, Referer=htgsdz)
        rowguid = parse.parse_qs(parse.urlparse(htgsdz).query)["rowguid"]
        detail = _send_with_retry(
            requests.post, _DETAIL_URL % rowguid[0],
            data=_DETAIL_FORM_BODY, headers=detail_headers)
        detail.encoding = "utf-8"
        controls = json.loads(detail.text)["controls"]

        # NOTE(review): a random key can collide with an existing row; the
        # insert then fails and is only printed by the handler below.
        row_id = random.randint(0, 999999999)

        # The thresholds reproduce the original: the purchasing unit is blank
        # only when empty, the other text fields when <= 1 character long.
        cgdw = _text_or_blank(controls[0]["value"], 0)        # purchasing unit
        items = controls[6]["data"]
        cgtmmc = items[0]["itemname"] if len(items) > 0 else _BLANK  # item name
        zbdw = _text_or_blank(controls[1]["value"], 1)        # winning bidder
        htbh = _text_or_blank(controls[2]["value"], 1)        # contract number
        htje = _text_or_blank(controls[3]["value"], 1)        # amount (yuan)
        zbgysbank = _text_or_blank(controls[4]["value"], 1)   # supplier's bank
        zbgyzh = _text_or_blank(controls[5]["value"], 1)      # supplier's account

        # Use this notice's own date; the original stamped every row with
        # times[0]. Assumes dates appear once per notice and in the same
        # order as the links -- TODO confirm against the live page.
        fbtime = str(times[temp]) if temp < len(times) else _BLANK

        # Skip contracts already stored (matched by contract number). Note
        # that a missing number is stored as "空", so only the first such
        # row is ever inserted -- behaviour kept from the original.
        cursor.execute("select count(*) from htgs where htid = %s", [htbh])
        if any(row[0] > 0 for row in cursor.fetchall()):
            over = True
            continue

        try:
            cursor.execute(
                _INSERT_SQL,
                [row_id, cgdw, cgtmmc, zbdw, htbh, htje,
                 zbgysbank, zbgyzh, htgsdz, fbtime])
            print(str(temp + 1) + ".[" + htbh + "]" + cgtmmc)
            db.commit()
        except Exception as e:
            # Best effort: report the failed row and carry on with the next.
            print(e)

    print("第%i页政府采购合同公示数据爬取成功!" % url)
# 获取抓取页数
def get_page():
    """Return the page count shown in the pager of the first listing page.

    The pager text has the form ``1/<n>``; the part after ``1/`` is returned
    as an int.

    Raises:
        IndexError: if the pager element is not found in the page.
        ValueError: if the captured text is not an integer.
    """
    first_listing = requests.get(
        "http://www.ccgp-jiangxi.gov.cn/web/jyxx/002006/002006006/1.html")
    pager_pattern = r'<span class="wb-page-default wb-page-number wb-page-family" id="index">1/(.*?)</span>'
    matches = re.findall(pager_pattern, first_listing.text, re.S)
    page_count = matches[0]
    return int(page_count)
# 主函数
if __name__ == '__main__':
    # Crawl every listing page, then release the database resources even if
    # crawling raised part-way through.
    try:
        # get_page() returns the number after "1/" in the pager (presumably
        # the last page number) and range() excludes its stop value, so +1 is
        # needed; the original ``range(1, get_page())`` never crawled the
        # final page.
        for url in range(1, get_page() + 1):
            # Incremental mode was commented out in the original: it would
            # stop here once ``over`` had been set by get_data().
            get_data(url)
        print("合同公示数据爬取完成!")
    finally:
        cursor.close()
        db.close()
程序可能存在部分bug,欢迎交流指正。
版权声明:本文为qq_35595164原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。