爬虫获取微博首页热搜

步骤:

打开微博热搜榜页面 https://s.weibo.com/top/summary?
右键点击检查，分析静态网页
将爬取到的内容保存为csv文件格式

需要导入的库

import requests
from lxml import etree
import pandas as pd

话不多说，直接上源码！

import requests
from lxml import etree
import pandas as pd
# Address of the Weibo hot-search ranking page requested by the scraper.
url = 'https://s.weibo.com/top/summary?'

# HTTP headers sent with every request; the User-Agent identifies the client
# as a mobile (Android) browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Mobile Safari/537.36 Edg/91.0.864.70'
    ),
}


def get_url(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        when the request fails or any other status code is returned.
    """
    try:
        # Without an explicit timeout a stalled server would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort fetch: report the failure (connection error, timeout,
        # invalid URL, ...) and let the caller deal with the None result.
        print(e.args)
        return None

    if response.status_code == 200:
        return response.text

    # Make the non-200 case visible instead of silently falling through.
    print('unexpected status code:', response.status_code)
    return None


def get_hot():
    """Scrape the Weibo hot-search list, print it, and save it as a CSV file.

    The page at the module-level ``url`` is downloaded through ``get_url``.
    For every ``<li>`` entry the title text and link are extracted, a numbered
    line is printed, and finally all entries are written to ``微博热搜.csv``
    (UTF-8 with BOM).

    Returns:
        None. No file is written when the page cannot be fetched or parsed,
        or when no entry is found.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    page = get_url(url)
    if not page:
        # get_url returns None on failure; etree.HTML(None) would raise.
        print('failed to fetch the hot-search page')
        return

    html = etree.HTML(page)
    if html is None:
        print('failed to parse the hot-search page')
        return

    index_list = []    # 1-based row numbers
    hotlist = []       # hot-search titles
    hot_url_list = []  # absolute links of the entries

    for item in html.xpath('/html/body/div/section/ul/li'):
        # xpath() always returns a list; an entry without a title or a link
        # yields an empty list, so check before taking the first element.
        titles = item.xpath('./a/span/text()')
        hrefs = item.xpath('./a/@href')
        if not titles or not hrefs:
            continue

        hot = titles[0]
        # The page uses site-relative links; urljoin builds a valid absolute
        # URL whether or not the href starts with a slash.
        hot_url = urljoin('https://s.weibo.com/', str(hrefs[0]))
        number = len(index_list) + 1

        print(number, hot, hot_url)
        index_list.append(number)
        hotlist.append(hot)
        hot_url_list.append(hot_url)

    if not index_list:
        return

    # Write the file once, after all rows have been collected.
    file = pd.DataFrame(data={'编号': index_list, '内容': hotlist, 'url': hot_url_list})
    file.to_csv('微博热搜.csv', encoding='utf_8_sig')


# Call the function to run the crawl.
get_hot()

运行结果:
（此处原为运行结果截图）
文件

到此，便完成了今天微博热搜的获取。

关于以上代码，需要留意的是 url 的拼接：网页源码中的链接是相对路径，没有 "https://s.weibo.com/" 这一前缀，需要自己补上才能正常打开，估计是对我的考验，哈哈！

还有就是 hot = i.xpath('./a/span/text()')[0] 后面的 [0] 不加会报错，但我又不知道是什么意思，还望大神指点迷津。（注：xpath() 返回的是一个列表，[0] 表示取列表中的第一个元素。）

xpath只是略懂皮毛，知识有限，还望走过路过多多指教！

原文链接：https://blog.csdn.net/qq_47828130/article/details/118893482