A PHP-based news crawler that inserts articles into a WordPress database

This crawler was written a long time ago; I simply haven't updated the blog in a while.

1. The basic idea: web pages can be fetched quickly and easily with PHP's curl_setopt() function (the cURL extension).

2. What kind of news attracts readers? Hot news, of course. Here we use Baidu's hot-search ranking board to get a list of trending keywords.

3. To keep the filtering simple, we only collect news from Sohu. Sohu's news is searchable through Sogou, so we search each Baidu hot keyword on Sogou, open the corresponding results, and keep only the Sohu news links.

4. Open each Sohu news page, fetch the article data, filter the content, and filter out duplicates.

5. Insert the article into the WordPress database, which gives you your own news link.

6. Submit your own news link to Baidu for indexing.

 

spider.class.php

 1 <?php
 2 //Web crawler 
 /**
  * Minimal cURL-based page fetcher.
  *
  * Constructing an instance immediately downloads $url and stores the
  * response body in $data, converted to UTF-8 when the page declares another
  * charset in a <meta> tag. The cURL handle is left open in $curl; the
  * subclasses in this project close it in their destructors.
  */
 class spider{
     /** cURL handle used for the request. */
     public $curl;
     /** Connection timeout in seconds (passed to CURLOPT_CONNECTTIMEOUT). */
     public $timeout = 5;
     /** Response body as a UTF-8 string, or false/empty when the request failed. */
     public $data;
     /** URL the page was fetched from. */
     public $fromUrl;

     /**
      * Fetch $url and normalise the body to UTF-8.
      *
      * @param string $url Address of the page to download.
      */
     public function __construct($url) {
         $this->fromUrl = $url;
         $this->curl = curl_init();
         // An empty string makes cURL advertise every encoding it supports
         // and transparently decode the (e.g. gzip) response.
         curl_setopt($this->curl, CURLOPT_ENCODING, "");
         curl_setopt($this->curl, CURLOPT_URL, $url);
         // Do not include response headers in the returned body.
         curl_setopt($this->curl, CURLOPT_HEADER, 0);
         // Return the body as a string instead of printing it.
         curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, 1);
         curl_setopt($this->curl, CURLOPT_CONNECTTIMEOUT, $this->timeout);
         // Present a desktop browser user agent to the remote site.
         curl_setopt($this->curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36');

         $this->data = curl_exec($this->curl);
         if (!$this->data) {
             // Request failed or returned nothing. A constructor cannot return
             // a value, so callers must inspect $data themselves.
             return;
         }

         // Detect the charset declared by the page, if any.
         if (!preg_match("/<meta.+?charset=[^\w]?([-\w]+)/i", $this->data, $temp)) {
             return;
         }
         $wcharset = strtolower($temp[1]);
         if ($wcharset === 'utf-8' || $wcharset === 'utf8') {
             // Already UTF-8 (compared case-insensitively): nothing to convert.
             return;
         }

         $converted = iconv($wcharset, "utf-8//IGNORE", $this->data);
         if ($converted !== false) {
             // Keep the original bytes when iconv cannot convert, rather than
             // replacing the downloaded page with false.
             $this->data = $converted;
         }
     }
 }
38 
39 ?>

mysql.class.php

 1 <?php
 2 //Database operations
 // Database bootstrap: declare UTF-8 output, open the MySQL connection and
 // select the WordPress database. Every step is checked before the next one
 // runs, so a failed connection is reported instead of being used.
 // NOTE(review): the mysql_* extension was removed in PHP 7; this file only
 // runs on PHP 5.x unless it is migrated to mysqli or PDO.
 header('Content-Type: text/html; charset=UTF-8');
 $conn = mysql_connect("10.10.10.10","","");
 if (!$conn){
     die('Database connection failed: ' . mysql_error());
 }
 if (!mysql_select_db('cn_bjxxw_wenzhang', $conn)){
     die('Database selection failed: ' . mysql_error($conn));
 }
 // Exchange data with the server as UTF-8.
 mysql_query("set names utf8", $conn);
10 class mysqlJi{//data base
11     static function mysqlQu($sql){//sql Sentence
12         return mysql_query($sql);        
13     }
14     static function mysqlFeAs($result){//Ergodic result
15         $i=0;//variable i
16         while($row = mysql_fetch_assoc($result)){
17         $reslist[$i] =$row;
18         $i++;
19         }
20         return $reslist;    
21     }
22     static function jsonEn($reslist){//output json
23         echo json_encode($reslist);
24     }
25     static function mysqlRows(){ //Rows affected
26         $info_str = mysql_info(); //Function returns the information of the latest query. Returns information about the statement if it succeeds, or if it fails false
27         $a_rows = mysql_affected_rows(); //Rows affected
28         preg_match("([0-9]*)", $info_str, $r_matched); 
29         return ($a_rows < 1)?($r_matched[1]?$r_matched[1]:0):$a_rows; 
30          
31     }
32     static function insertId(){//Last recorded ID
33         $getID=mysql_insert_id();//$getID Which is the last record ID
34         return $getID;
35     }
36     static function mysqlCl(){//close database
37         mysql_close();
38     }
39 }
40 ?>

baiduPush.class.php

 1 <?php
 2 //Baidu includes active submission
 /**
  * Submits URLs to Baidu's active link-submission endpoint.
  *
  * The request is sent from the constructor; the raw response body (or false
  * when the transfer failed) is available in $result afterwards.
  */
 class baiduPush{
     /** Raw response returned by the API, or false on transfer failure. */
     public $result;

     /**
      * POST the URLs, one per line, as text/plain to $api.
      *
      * @param array|string $urls URLs to submit; a single URL string is also accepted.
      * @param string       $api  Full submission endpoint including site and token.
      */
     public function __construct($urls,$api){
         $ch = curl_init();
         $options = array(
             CURLOPT_URL => $api,
             CURLOPT_POST => true,
             CURLOPT_RETURNTRANSFER => true,
             CURLOPT_POSTFIELDS => implode("\n", (array)$urls),
             CURLOPT_HTTPHEADER => array('Content-Type: text/plain'),
         );
         curl_setopt_array($ch, $options);
         $this->result = curl_exec($ch);
         // Release the handle; the original leaked one handle per submission.
         curl_close($ch);
     }
 }
18 
19 ?>

info.php

 1 <?php
 2 //This page is for news details
 3 include ("mysql.class.php");//Import database related
 4 date_default_timezone_set("PRC");//time zone
 5 include ("spider.class.php");
 6 include("baiduPush.class.php");
 /**
  * Extracts a Sohu news article from the page fetched by the parent spider
  * and stores it as a WordPress post.
  */
 class spider_cont extends spider{
     /** Title match; index 0 holds the tag-stripped headline after sohuCon(). */
     public $title = array();
     /** Time after filtering (not populated by the code in this class). */
     public $stime = array();
     /** Body match; index 0 holds the filtered article HTML after sohuCon(). */
     public $screenData = array();
     /** News classification (not populated by the code in this class). */
     public $classNew;

     /**
      * Parse the downloaded Sohu page and insert it through sqlDo().
      *
      * Pages containing a "top-pager-current" marker are skipped. The body is
      * located with three layout patterns tried in order; the headline and
      * source name are stripped of tags before being stored.
      *
      * @param int $fenlei Term taxonomy id the post is attached to.
      */
     public function sohuCon($fenlei){
         // A failed download leaves $data as false; treat it as an empty page.
         $html = is_string($this->data) ? $this->data : '';

         if (preg_match('/top\-pager\-current/', $html)) {
             print "------Data vent";
             return;
         }

         // Body layouts, tried in order until one matches.
         $bodyPatterns = array(
             '/<div[^>]*itemprop="articleBody"[^>]*>(.*?) seo/si',
             '/<div[^>]*id="contentText"[^>]*>(.*?) seo/si',
             '/<div[^>]*id="contentText"[^>]*>(.*?) -->/si',
         );
         $this->screenData = array();
         $matchedIndex = -1;
         foreach ($bodyPatterns as $index => $pattern) {
             if (preg_match($pattern, $html, $found) && $found[0] != '') {
                 $this->screenData = $found;
                 $matchedIndex = $index;
                 break;
             }
         }

         if ($matchedIndex === 0 || $matchedIndex === 1) {
             // For the first two layouts the first <h1> is the headline.
             preg_match('/<h1(.*?)>(.*?)<\/h1>/si', $html, $this->title);
         } else {
             // Otherwise the original logic takes the second <h1> on the page.
             preg_match_all('/<h1(.*?)>(.*?)<\/h1>/si', $html, $tit2);
             $this->title[0] = isset($tit2[0][1]) ? $tit2[0][1] : '';
         }

         // Drop media_span_url('...') calls, then keep only basic markup.
         $rawBody = isset($this->screenData[0]) ? $this->screenData[0] : '';
         $rawBody = (string)preg_replace("/media_span_url\(\'(.*?)\'\)/si", "", $rawBody);
         $this->screenData[0] = $this->guolv($rawBody, "<img><div><span><p>");

         $rawTitle = isset($this->title[0]) ? $this->title[0] : '';
         $this->title[0] = $this->guolv($rawTitle, "");

         // Source (publisher) name, tag-stripped; empty when not present.
         $g_laiyuan = '';
         if (preg_match('/<span itemprop="name">(.*?)<\/span>/si', $html, $laiyuan)) {
             $g_laiyuan = $this->guolv($laiyuan[0], "");
         }

         if ($this->screenData[0] !== '' && $this->title[0] !== '' && $g_laiyuan != "Beijing News") {
             // Four-digit year so the value is an unambiguous DATETIME.
             $atime = date('Y-m-d H:i:s');
             $this->sqlDo($this->title[0], $atime, $this->screenData[0], $g_laiyuan, $this->fromUrl, $fenlei);
         } else {
             echo $this->title[0].$g_laiyuan."-----Data is empty<br>";
         }
     }

     /**
      * Insert the article into wp_posts unless its source URL is already
      * stored, push the new permalink to Baidu and attach the post to a term.
      *
      * All values are escaped with mysql_real_escape_string() because they
      * come from crawled pages. The *_gmt columns receive the UTC equivalent
      * of $atime (falling back to $atime when it cannot be parsed).
      *
      * @param string $atitle   Article title.
      * @param string $atime    Local publication time ("Y-m-d H:i:s").
      * @param string $acontent Article body HTML.
      * @param string $fromName Source name.
      * @param string $biaoshi  Source URL, used as the duplicate key.
      * @param int    $fenlei   Term taxonomy id.
      */
     public function sqlDo($atitle,$atime,$acontent,$fromName,$biaoshi,$fenlei){
         $timestamp = strtotime($atime);
         $gmtTime = ($timestamp === false) ? $atime : gmdate('Y-m-d H:i:s', $timestamp);

         $sqlTitle = mysql_real_escape_string($atitle);
         $sqlTime = mysql_real_escape_string($atime);
         $sqlGmt = mysql_real_escape_string($gmtTime);
         $sqlContent = mysql_real_escape_string($acontent);
         $sqlSource = mysql_real_escape_string($fromName);
         $sqlUrl = mysql_real_escape_string($biaoshi);
         $termId = (int)$fenlei;

         // Check whether this source URL has already been imported.
         $getBiaoshi = mysqlJi::mysqlQu("SELECT COUNT(*) as biaoshi FROM `wp_posts` WHERE post_content_filtered='{$sqlUrl}';");
         if (!$getBiaoshi) {
             // Without a working duplicate check, do not risk a double insert.
             echo $atitle."----source".$fromName."----fail<br>";
             return;
         }
         $acount = mysqlJi::mysqlFeAs($getBiaoshi);
         $existing = isset($acount[0]['biaoshi']) ? (int)$acount[0]['biaoshi'] : 0;
         if ($existing != 0) {
             echo "--Database already exists--";
             return;
         }

         mysqlJi::mysqlQu("INSERT INTO `wp_posts` (`ID`, `post_author`, `post_date`, `post_date_gmt`, `post_content`, `post_title`, `post_excerpt`, `post_status`, `comment_status`, `ping_status`, `post_password`, `post_name`, `to_ping`, `pinged`, `post_modified`, `post_modified_gmt`, `post_content_filtered`, `post_parent`, `guid`, `menu_order`, `post_type`, `post_mime_type`, `comment_count`, `laiyuan`, `description`) VALUES ('', '9192206', '{$sqlTime}', '{$sqlGmt}', '{$sqlContent}', '{$sqlTitle}', '', 'publish', 'open', 'open', '', '%e6%9e%81%e5%8c%96%e6%b3%a2', '', '', '{$sqlTime}', '{$sqlGmt}', '{$sqlUrl}', '0', '', '0', 'post', '', '0', '{$sqlSource}', '')");
         if (mysqlJi::mysqlRows() != 1) {
             echo $atitle."----source".$fromName."----fail<br>";
             return;
         }

         $inid = mysqlJi::insertId();
         // Announce the new permalink to Baidu.
         // NOTE(review): the API token is hard-coded here; consider moving it to configuration.
         $urls = array('http://wenzhang.bjxxw.com/archives/'.$inid.'.html');
         $baiduApi = 'http://data.zz.baidu.com/urls?site=wenzhang.bjxxw.com&token=GKEJ4ENj4i6PMu51';
         $aBaiduPush = new baiduPush($urls, $baiduApi);
         echo $aBaiduPush->result;

         // Attach the post to its category.
         mysqlJi::mysqlQu("INSERT INTO `wp_term_relationships` (`object_id`, `term_taxonomy_id`) VALUES ('{$inid}', '{$termId}')");
         echo $atitle."----source".$fromName."----insert id by".$inid."----Success<br>";
     }

     /**
      * Strip HTML tags from $data, keeping the tags listed in $chule.
      *
      * @param string $data  HTML fragment.
      * @param string $chule Allowed tags, e.g. "<img><p>"; empty to strip all.
      * @return string
      */
     public function guolv($data,$chule){
         return strip_tags($data,$chule);
     }

     /** Release the cURL handle opened by the parent constructor. */
     public function __destruct(){
         if ($this->curl) {
             curl_close($this->curl);
             $this->curl = null;
         }
     }
 }
87 //$aNewcont= new spider_cont("http://mil.sohu.com/20160905/n467640017.shtml");
88 //$aNewcont->sohuCon(3013);
89 
90 ?>

getRollNews.php

  1 <?php
  2 //This page is for news list
  3 //Introduction details page
  4 include ("info.php");
  5 ///Web crawler list
  /**
   * Crawls a keyword list page (the Baidu hot-search board) and, for every
   * keyword found, searches Sogou News for matching Sohu articles.
   */
  class spider_list extends spider {
      /**
       * Extract keywords from the fetched page and crawl each one.
       *
       * @param string $zhengze Regex (with delimiters) matching one keyword cell.
       * @param int    $fenlei  Term taxonomy id passed on to the article importer.
       */
      public function screen_list($zhengze, $fenlei) {
          // Keep only <td> markup, then remove the literal word "search".
          $this->data = $this->guolv($this->data, '<td>');
          $this->data = preg_replace("/search/si", "", $this->data);

          preg_match_all($zhengze, $this->data, $regArr, PREG_SET_ORDER);
          if (!is_array($regArr)) {
              $regArr = array();
          }

          // Flatten to the full-match strings and drop duplicates.
          $array = array();
          foreach ($regArr as $match) {
              $array[] = $match[0];
          }
          $array = array_values(array_unique($array));

          // Sohu article URL pattern; constant for the whole run.
          $sohuList = "/http:\/\/([\.a-z]+)\.sohu\.com\/20(\d+)\/n(\d+)\.shtml/";

          foreach ($array as $i => $cell) {
              if (!$cell) {
                  continue;
              }
              // Plain keyword text without markup or surrounding whitespace.
              $new = trim(strip_tags($cell));
              if ($new === '') {
                  // Nothing to search for once the markup is removed.
                  continue;
              }
              echo "<hr>".($i+1)."adopt:<em>" . $new . "</em> Search to::";
              $ser = rawurlencode($new);
              $sohuUrl = new spider_sohu("http://news.sogou.com/news?query=site%3Asohu.com+" . $ser);
              $sohuUrl->screen_list($sohuList, $fenlei);
          }
      }

      /**
       * Strip HTML tags from $data, keeping the tags listed in $chule.
       *
       * @param string $data  HTML fragment.
       * @param string $chule Allowed tags; empty to strip all.
       * @return string
       */
      public function guolv($data, $chule) {
          return strip_tags($data, $chule);
      }

      /** Release the cURL handle opened by the parent constructor. */
      public function __destruct() {
          if ($this->curl) {
              curl_close($this->curl);
              $this->curl = null;
          }
      }
  }
 55 
 56 ///Web crawler list
 /**
  * Reads a Sogou News result page and imports the first Sohu article linked
  * from its "vrTitle" heading.
  */
 class spider_sohu extends spider {
     /**
      * Find the first URL matching $zhengze in the result heading and import it.
      *
      * @param string $zhengze Regex (with delimiters) matching a Sohu article URL.
      * @param int    $fenlei  Term taxonomy id passed on to the article importer.
      */
     public function screen_list($zhengze, $fenlei) {
         // Only the first result heading is considered.
         $heading = '';
         if (preg_match('/<h3 class="vrTitle">(.*?)<\/h3>/si', (string)$this->data, $gulv1)) {
             $heading = $gulv1[0];
         }
         $this->data = $this->guolv($heading, '<a><h3>');

         if (!preg_match($zhengze, $this->data, $regArr)) {
             // No Sohu link found: print what the original flow printed in this
             // case, but skip the pointless crawl of an empty URL.
             echo "<br>";
             echo "-----Data is empty<br>";
             return;
         }

         echo $regArr[0]."<br>";
         $aNewcont = new spider_cont($regArr[0]);
         $aNewcont->sohuCon($fenlei);
     }

     /**
      * Strip HTML tags from $data, keeping the tags listed in $chule.
      *
      * @param string $data  HTML fragment.
      * @param string $chule Allowed tags; empty to strip all.
      * @return string
      */
     public function guolv($data, $chule) {
         return strip_tags($data, $chule);
     }

     /** Release the cURL handle opened by the parent constructor. */
     public function __destruct() {
         if ($this->curl) {
             curl_close($this->curl);
             $this->curl = null;
         }
     }
 }
 85 
 // Entry point: crawl each Baidu hot-search board and import the Sohu
 // articles found for its keywords.

 // Regex matching one keyword cell of a Baidu board.
 $baiduList = '/<td[^>]*class="keyword">(.*?)<\/td>/si';

 // Boards to crawl: aid = WordPress term taxonomy id, aurl = board URL,
 // aname = label printed in the progress output.
 // URL shape: http://top.baidu.com/buzz?b=<board>&c=513&fr=topbuzz_b<ref>_c513
 $NewUrls = array(
     array('aid'=>3021,'aurl'=>'http://top.baidu.com/buzz?b=344&c=513&fr=topbuzz_b42_c513','aname'=>'entertainment'),//0
     array('aid'=>2585,'aurl'=>'http://top.baidu.com/buzz?b=341&c=513&fr=topbuzz_b1_c513','aname'=>'hot today'),//1
     array('aid'=>2585,'aurl'=>'http://top.baidu.com/buzz?b=1&c=513&fr=topbuzz_b344_c513','aname'=>'hotspot'),//2
     array('aid'=>2585,'aurl'=>'http://top.baidu.com/buzz?b=42&c=513&fr=topbuzz_b341_c513','aname'=>'hotspot'),//3
 );

 foreach ($NewUrls as $board) {
     echo "<br>-----------classification-----------".$board['aname']."------------<br>";
     $aNewList = new spider_list($board['aurl']);
     // The board page is parsed with the Baidu keyword pattern; the Sohu URL
     // pattern is applied later, inside spider_list::screen_list().
     $aNewList->screen_list($baiduList, $board['aid']);
 }

 // Close the database connection.
 mysqlJi::mysqlCl();
105 
106 ?>

Later, we added automatic synonym replacement to the news text. After the replacement, however, the articles became too awkward to read, so the feature was abandoned.

Tags: PHP curl Database SQL

Posted on Sun, 03 May 2020 06:30:08 -0700 by bigscanner