perl脚本爬虫程序,支持爬取北大未名bbs、163新闻、ifeng新闻、猫扑论坛、sina新闻等

【实例简介】

采用perl脚本写的爬虫程序,可以爬取北大未名bbs、163新闻、ifeng新闻、猫扑论坛、sina新闻等

【实例截图】

在这里插入图片描述
文件:590m.com/f/25127180-494436243-4727a5(访问密码:551685)

【核心代码】

抓取代码一:

#!/usr/bin/perl -w
 
package Parser_ifeng;
use Exporter;
our @ISA=qw(Exporter);
 
our @EXPORT=qw(parser_ifeng);
 
#use strict;
use warnings;
use LWP::UserAgent;
use LWP::Simple;
use HTML::TokeParser;
use Encode;
use utf8;
use Time::Local;
use Mysql_conn;
 
#binmode(STDIN, ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');
#binmode(STDERR, ':encoding(utf8)');
 
 
my $workdir="/home/webscience/proj/";
my $pmdir="$workdir/pm/";
my $logdir="$workdir/log/";
my $logfile="$logdir/ifeng.log";
my $logfp;
my $tmpdir="$workdir/tmp/";
my $tmpfile="$tmpdir/ifeng.tmp";
my $tmpfile1="$tmpdir/ifeng1.tmp";
my %board_list=();
my $main_url="http://www.tianya.cn/bbs/index.shtml";
 
 
 
sub parser_ifeng
{
 
    #初始化日志文件
    open($logfp,">>$logfile") or die "can't open file:$!";
 
    $board_list{'点击排行榜'}="http://news.ifeng.com/toprank/day/";
    $board_list{'评论排行榜'}="http://news.ifeng.com/comment/day/";
    $board_list{'表情_感动'}="http://news.ifeng.com/mood/day/1/";
    $board_list{'表情_同情'}="http://news.ifeng.com/mood/day/2/";
    $board_list{'表情_无聊'}="http://news.ifeng.com/mood/day/3/";
    $board_list{'表情_愤怒'}="http://news.ifeng.com/mood/day/4/";
    $board_list{'表情_搞笑'}="http://news.ifeng.com/mood/day/5/";
    $board_list{'表情_难过'}="http://news.ifeng.com/mood/day/6/";
    $board_list{'表情_高兴'}="http://news.ifeng.com/mood/day/7/";
    $board_list{'表情_路过'}="http://news.ifeng.com/mood/day/8/";
 
        my $key;
        my $value;
    local %url_hash;#用于去除重复的新闻
        while(($key, $value) = each %board_list){
        my %board;
        $board{'board_name'}=$key;
        $board{'board_url'}=$value;
        $board{'parent_name'}="凤凰网";
        #获取版块id,若是新版块,则写入数据库
        #------
        my ($fid, $parent_id) = insert2forum(%board);
        $board{'board_id'} = $fid;
        $board{'parent_id'} = $parent_id;
        parser_board(%board);
    }
     
    phash(%board_list);#test
}
 
 
sub parser_board(%)
{
    my %attr=();
    my $token;
    my $page;
    my $p;
    my %board=@_;
    my $url=$board{'board_url'};
    my @poster_list;
     
        my $tmp;
    my $i=0;
    #获取当前时间
    my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
    $t_year =1900;
    $t_mon =1;
    #unix时间
    local $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900);
 
    print "board_url=$url\n";
        $page=get($url);
 
        utf8::decode($page);
        $p= HTML::TokeParser->new(\$page);   
     
    #解析新闻列表
        while ($token = $p->get_tag("div")) {
                %attr=%{$token->[1]};
        if(!$attr{'class'} || $attr{'class'} ne 'listPublic')
            {next;}
 
        $token = $p->get_tag("a");
                %attr=%{$token->[1]};
 
        #去除重复的新闻    
        my $href=$attr{'href'};
        if($url_hash{"$href"})
            {next;  }
 
        $i  ;
        $poster_list[$i]->{'poster_url'}=$attr{'href'};
        $poster_list[$i]->{'title_name'}=$p->get_trimmed_text("/a");
        $poster_list[$i]->{'board_name'}=$board{'board_name'};
        $poster_list[$i]->{'board_id'} = $board{'board_id'}; #Add board id
        $poster_list[$i]->{'parent_id'} = $board{'parent_id'};   #Add board id
        $url_hash{"$href"}=$poster_list[$i]->{'title_name'};
             
#       phash(%{$poster_list[$i]});exit;
        }
 
    $i=1;
    foreach $poster (@poster_list)
    {
        if(!$poster || $poster eq ""){next;}
        phash(%$poster);
         
        my ($code, $poster, $reply) = parser_poster(\%$poster);
        #if($code == 0){
        #   insert2topic($board{'board_id'}, \%$poster, \@$reply);
        #}      
        $i  ;   
    }
}
 
#获取主贴信息
sub parser_poster
{
    my %attr=();
    my $page;
    my $tmp;
    my $p;
    my $token;  
    my $poster=$_[0];
    local @reply;
    my $url=$poster->{'poster_url'};
     
        $page=get($url);
        utf8::decode($page);
        $p= HTML::TokeParser->new(\$page);   
 
    #定位到论坛开始 主贴内容开始标识<div id="artical_sth">   
        while($token=$p->get_tag('div')){
                %attr=%{$token->[1]};
                if($attr{'id'} && $attr{'id'} eq 'artical_sth')
                        {last;}
        }
    if(!$token){
        wlog("$url 格式错误,跳过此帖"); 
        return -1;
    }
 
    #抓取发帖时间
    $token=$p->get_tag('span');
    $tmp=$p->get_trimmed_text('/span');
    $pattern='([\d]*)年([\d]*)月([\d]*)日 ([\d]*):([\d]*)';
    if(!($tmp =~ m/$pattern/)){
        wlog("$url 时间获取失败,跳过此帖");   
        return -1;
    }
    $poster->{'post_time'}="$1-$2-$3 $4:$5:00";      
     
    #抓取新闻来源
    $token=$p->get_tag('span');
    $poster->{'info_source'}=$p->get_trimmed_text('/span');
 
    #抓取评论数
    $token=$p->get_tag('span');
    $poster->{'reply_num'}=$p->get_trimmed_text('/span');
 
    #查找主贴内容 定位到<div id="clear">   
    while($token=$p->get_token()){
        if($token->[0] eq "C"&& $token->[1] eq "<!--mainContent begin-->" )
            {last;}
    }
    if(!$token){
        wlog("$url 抓取内容失败,跳过此帖");   
        return -1;
    }
    $poster->{'content'}=$p->get_trimmed_text('div');
     
    #将帖子内容和回复数据写入数据库
    #insert2topic("topic", %$poster);
    phash(%$poster);#test
    #sqldeal(\%$poster,\@reply);
    insert2news(\%$poster);
    return 0;
}
 
#获取回复信息
sub get_reply()
{
    my $p=$_[0];
    my $tmp;
    my $page;
    my $i=1;
    my $url;
    my %attr;
    my %attr1;
    my $token;
    my $token1;
 
R_BEGIN: while($token=$p->get_tag('table','div')){#'td')){
        %attr=%{$token->[1]};
        #<td  id="pageDivBottom">,定位下一页url
        if($token->[0] eq 'div' && $attr{'id'} && $attr{'id'} eq 'adsp_content_banner_2'){
####        if($token->[0] eq 'td' && $attr{'id'} && $attr{'id'} eq 'pageDivBottom'){
            while($token=$p->get_tag('a')){
                if($p->get_trimmed_text('/a') eq '下一页'){
                    %attr=%{$token->[1]};
                    $url=$attr{'href'};
                        $page=get($url);
                        utf8::decode($page);
                        $p= HTML::TokeParser->new(\$page);
 
                    #重新定位到<div id="pContentDiv">  
                    while($token=$p->get_tag('div')){
                        %attr=%{$token->[1]};
                        if($attr{'id'} && $attr{'id'} eq 'pContentDiv')
                            {next R_BEGIN;}
                    }
                }
            }       
            last R_BEGIN;#没有下一页,已经是最后一页,退出R_BEGIN
        }elsif($token->[0] eq 'div'){
            next;
        }
 
                ##抓取回复, get_tag('table')
                #获取回帖用户id
                $p->get_tag('a');
        $reply[$i]->{'uid'}=$p->get_trimmed_text('/a');
 
                #获取回帖时间
                $tmp=$p->get_trimmed_text('/td');
                $patern='回复日期:([\d\D]*)';
                if($tmp=~m/$patern/)
                        {$reply[$i]->{'reply_time'}=$1;}
 
                #获取回帖内容
                $reply[$i]->{'content'}=$p->get_trimmed_text('table','div');
                $i  ;
    }
}
 
#打印reply数组内容
sub preply(@)
{
    my @reply=@_;
    my $i;
 
    for($i=1;$i<=$#reply;$i  )
    {print "[$i]\n";phash(%{$reply[$i]});}
}
 
#写入日志到
sub wlog($)
{
    my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime();
    $t_year =1900;
    $t_mon =1;
        print $logfp "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
        print  "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n";
}
 
#打印hash表内容
sub phash(%)
{
    my $key;
    my $value;
    my %hash =@_;   
    my $i=1;
 
    while(($key, $value) = each %hash){   print $i  ,"  $key = $value\n"; } #test
    print "\n";
}
 
sub sqldeal(\%\@)
{
    my %poster =%{$_[0]};
    my @reply =@{$_[1]};
 
#   phash(%$poster);#test
#   preply(@reply);#test
 
}
 
1;

以下内容无关:

-------------------------------------------分割线---------------------------------------------

开发中经常会遇到树形结构的场景,比如:导航菜单、组织机构等等,但凡是有这种父子层级结构的都是如此,一级类目、二级类目、三级类目。。。

对于这种树形结构的表要如何设计呢?接下来一起探讨一下

首先,想一个问题,用非关系型数据库存储可不可以?

答案是肯定可以的,比如用mongoDB,直接将整棵树存成json。但是,这样不利于按条件查询,当然也取决于具体的需求,抛开需求谈设计都是耍流氓。

在菜单这个场景下,一般还是用关系型数据库存储,可以将最终的查询结构缓存起来。

常用的方法有四种:

每一条记录存parent_id
每一条记录存整个tree path经过的node枚举
每一条记录存 nleft 和 nright
维护一个表,所有的tree path作为记录进行保存

第一种:每条记录存储parent_id

这种方式简单明了,但是想要查询某个节点的所有父级和子级的时候比较困难,势必需要用到递归,在mysql里面就得写存储过程,太麻烦了。

当然,如果只有两级的话就比较简单了,自连接就搞定了,例如:

第四种:单独用一种表保存节点之间的关系

复制代码
CREATE TABLE city (
id int(11) NOT NULL AUTO_INCREMENT,
name varchar(16),
PRIMARY KEY (id) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8mb4;

CREATE TABLE city_tree_path_info (
id int(11) NOT NULL AUTO_INCREMENT,
city_id int(11) NOT NULL,
ancestor_id int(11) NOT NULL COMMENT ‘祖先ID’,
level tinyint(4) NOT NULL COMMENT ‘层级’,
PRIMARY KEY (id) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8mb4;
复制代码
上面这个例子中,city表代表城市,city_tree_path_info代表城市之间的层级关系,ancestor_id表示父级和祖父级ID,level是当前记录相对于ancestor_id而言的层级。这样就把整个层级关系保存到这张表中了,以后想查询某个节点的所有父级和子级就很容易了。

最后,我发现构造这种层级树最简单的还是用java代码

java递归生成菜单树

Menu.java

1 package com.example.demo.model;
2
3 import lombok.AllArgsConstructor;
4 import lombok.Data;
5 import lombok.NoArgsConstructor;
6
7 import java.util.List;
8
9 @AllArgsConstructor
10 @NoArgsConstructor
11 @Data
12 public class Menu {
13
14 /**
15 * 菜单ID
16 /
17 private Integer id;
18
19 /
*
20 * 父级菜单ID
21 /
22 private Integer pid;
23
24 /
*
25 * 菜单名称
26 /
27 private String name;
28
29 /
*
30 * 菜单编码
31 /
32 private String code;
33
34 /
*
35 * 菜单URL
36 /
37 private String url;
38
39 /
*
40 * 菜单图标
41 /
42 private String icon;
43
44 /
*
45 * 排序号
46 /
47 private int sort;
48
49 /
*
50 * 子级菜单
51 */
52 private List

children;
53
54 public Menu(Integer id, Integer pid, String name, String code, String url, String icon, int sort) {
55 this.id = id;
56 this.pid = pid;
57 this.name = name;
58 this.code = code;
59 this.url = url;
60 this.icon = icon;
61 this.sort = sort;
62 }
63
64 }
Test.java

1 package com.example.demo.model;
2
3 import com.fasterxml.jackson.core.JsonProcessingException;
4 import com.fasterxml.jackson.databind.ObjectMapper;
5
6 import java.util.ArrayList;
7 import java.util.Comparator;
8 import java.util.List;
9 import java.util.stream.Collectors;
10
11 public class Hello {
12 public static void main(String[] args) throws JsonProcessingException {
13 List

allMenuList = new ArrayList<>();
14 allMenuList.add(new Menu(1, 0, “湖北”, “HuBei”, “/a”, “a”, 3));
15 allMenuList.add(new Menu(2, 0, “河南”, “HeNan”, “/b”, “b”, 2));
16 allMenuList.add(new Menu(3, 1, “宜昌”, “YiChang”, “/c”, “c”, 2));
17 allMenuList.add(new Menu(4, 2, “信阳”, “XinYang”, “/d”, “d”, 1));
18 allMenuList.add(new Menu(5, 1, “随州”, “SuiZhou”, “/e”, “e”, 1));
19 allMenuList.add(new Menu(6, 5, “随县”, “SuiXian”, “/f”, “f”, 2));
20 allMenuList.add(new Menu(7, 3, “枝江”, “ZhiJiang”, “/g”, “g”, 2));
21
22 // 一级菜单
23 ListparentList = allMenuList.stream().filter(e->e.getPid()==0).sorted(Comparator.comparing(Menu::getSort)).collect(Collectors.toList());
24 // 递归调用,为所有一级菜单设置子菜单
25 for (Menu menu : parentList) {
26 menu.setChildren(getChild(menu.getId(), allMenuList));
27 }
28
29 ObjectMapper objectMapper = new ObjectMapper();
30 System.out.println(objectMapper.writeValueAsString(parentList));
31 }
32
33 /**
34 * 递归查找子菜单
35 * @param id 当前菜单ID
36 * @param allList 查找菜单列表
37 * @return
38 */
39 public static ListgetChild(Integer id, ListallList) {
40 // 子菜单
41 ListchildList = new ArrayList<>();
42 for (Menu menu : allList) {
43 if (menu.getPid().equals(id)) {
44 childList.add(menu);
45 }
46 }
47
48 // 为子菜单设置子菜单
49 for (Menu nav : childList) {
50 nav.setChildren(getChild(nav.getId(), allList));
51 }
52
53 // 排序
54 childList = childList.stream().sorted(Comparator.comparing(Menu::getSort)).collect(Collectors.toList());
55
56 if (childList.size() == 0) {
57 // return null;
58 return new ArrayList<>();
59 }
60 return childList;
61 }
62 }
结果:

1 [
2 {
3 “id”:2,
4 “pid”:0,
5 “name”:“河南”,
6 “code”:“HeNan”,
7 “url”:"/b",
8 “icon”:“b”,
9 “sort”:2,
10 “children”:[
11 {
12 “id”:4,
13 “pid”:2,
14 “name”:“信阳”,
15 “code”:“XinYang”,
16 “url”:"/d",
17 “icon”:“d”,
18 “sort”:1,
19 “children”:[]
20 }
21 ]
22 },
23 {
24 “id”:1,
25 “pid”:0,
26 “name”:“湖北”,
27 “code”:“HuBei”,
28 “url”:"/a",
29 “icon”:“a”,
30 “sort”:3,
31 “children”:[
32 {
33 “id”:5,
34 “pid”:1,
35 “name”:“随州”,
36 “code”:“SuiZhou”,
37 “url”:"/e",
38 “icon”:“e”,
39 “sort”:1,
40 “children”:[
41 {
42 “id”:6,
43 “pid”:5,
44 “name”:“随县”,
45 “code”:“SuiXian”,
46 “url”:"/f",
47 “icon”:“f”,
48 “sort”:2,
49 “children”:[]
50 }
51 ]
52 },
53 {
54 “id”:3,
55 “pid”:1,
56 “name”:“宜昌”,
57 “code”:“YiChang”,
58 “url”:"/c",
59 “icon”:“c”,
60 “sort”:2,
61 “children”:[
62 {
63 “id”:7,
64 “pid”:3,
65 “name”:“枝江”,
66 “code”:“ZhiJiang”,
67 “url”:"/g",
68 “icon”:“g”,
69 “sort”:2,
70 “children”:[]
71 }
72 ]
73 }
74 ]
75 }
76 ]