返回列表 回复 发帖

phpchina新闻小偷聚合器

<?

/*

phpchina聚合器

聚合phpchina的文章,并且每次运行不添加重复数据。

在 cdb_threads 表里面增加两个字段

tinyint(1):isCollection
char(255):collectionURL

适用于 Discuz!5.0 版本,当然稍微改动一下可以放到任何系统里。

http://www.phpmy.net,langwan<langwanluo@gmail.com>

*/

// The run downloads many remote pages, so remove the execution time limit.
set_time_limit(0);

// Shared secret that must be supplied as ?pagePasswd=... to run this script.
$pagePasswd = 'http://www.zybbs.net'; // change this for your own installation

// Refuse to run unless the secret is present and matches exactly.
// isset() avoids an undefined-index notice when the parameter is missing.
if (!isset($_GET['pagePasswd']) || $_GET['pagePasswd'] !== $pagePasswd) {
    exit('拒绝访问');
}

// Forum account and board id written into every imported thread and post.
$userID = 1;
$userName = 'admin';
$fid = 6;

// Database credentials.
$dbHost = 'localhost';
$dbName = 'dbName'; // change this
$dbUser = 'dbUser'; // change this
$dbPass = 'dbPass'; // change this

// Stop right away when the database is unreachable instead of crawling
// remote pages whose results could never be stored.
if (!mysql_connect($dbHost, $dbUser, $dbPass)) {
    exit('Database connection failed: ' . mysql_error());
}
if (!mysql_select_db($dbName)) {
    exit('Cannot select database: ' . mysql_error());
}
mysql_query("SET character_set_connection=gbk, character_set_results=gbk, character_set_client=binary");
mysql_query("SET sql_mode=''");

// Category listing pages that are crawled on every run.
$urlArray = array(
    'http://www.phpchina.com/category_71.html',
    'http://www.phpchina.com/category_69.html',
    'http://www.phpchina.com/category_72.html',
    'http://www.phpchina.com/category_81.html'
);

// Delimiters of the article list inside a listing page.
$blockStart = '<div id="newslist">';
$blockEnd = '<!--新闻(标题)-->';

// Delimiters of the pager (links to the further listing pages).
$pageStart = '<td class="xspace-current">';
$pageEnd = '&gt;</a></td></tr></table>';

// Delimiters inside an article page: title, publish time, author,
// source and body. The title and source blocks both close with '</div>'.
$TitleStart = '<div id="net">';
$TitleEnd = $sourceEnd = '</div>';

$TimeStart = '发布时间: ';
$AuthorStart = '  作者: ';
$sourceStart = '   信息来源: ';

$contentStart = '<div id="nec">';
$contentEnd = '<center><input type="image" onclick=copyToClipBoard()';

// Crawl every category: collect the articles that were not imported yet
// (newest first) and turn each of them into a forum thread.
foreach ($urlArray as $url) {

    $subjectArray = array();

    // Newest thread already imported from this category. Its subject marks
    // the point where scanning must stop. $url comes from the hard-coded
    // list above, so it is safe to interpolate here.
    $sql = "SELECT subject FROM cdb_threads WHERE isCollection = '1' AND collectionURL = '$url' ORDER BY dateline DESC LIMIT 1";
    $res = mysql_query($sql);
    if ($res === false) {
        // Without this lookup duplicates cannot be detected; skip the
        // category rather than re-importing everything.
        echo mysql_error();
        continue;
    }
    $row = mysql_fetch_array($res);

    // getSubject() stops at 'subject' and stamps 'url' on every new entry;
    // importBBS() later stores that value as collectionURL.
    $stopSubject = array(
        'subject' => $row ? $row['subject'] : '',
        'url'     => $url,
    );

    $firstPage = file_get_contents($url);
    if ($firstPage === false) {
        continue;
    }

    // The listing page itself must be scanned FIRST because it holds the
    // newest articles; the pager links follow in their original order.
    $pager = getPage(getBlock($firstPage, $pageStart, $pageEnd));
    $pageArray = array_merge(array($url), array_values($pager));

    foreach ($pageArray as $index => $page) {

        // Reuse the already downloaded first page instead of fetching it twice.
        $data = ($index === 0) ? $firstPage : file_get_contents($page);
        if ($data === false) {
            continue;
        }

        $list = getBlock($data, $blockStart, $blockEnd);

        $found = getSubject($list, $stopSubject);
        if (is_array($found)) {
            $subjectArray = array_merge($subjectArray, $found);
        }

        // Stop when nothing has been found at all, or when the last entry
        // is the empty sentinel getSubject() emits at the stop subject.
        $last = end($subjectArray);
        if ($last === false || $last['link'] == '') {
            break;
        }
    }

    foreach ($subjectArray as $v) {
        if ($v['link'] != '') {
            $content = getContent($v);
            // getContent() returns an empty array when the download failed.
            if (!empty($content)) {
                importBBS($content);
            }
        }
    }

}


/**
 * Store one scraped article as a new thread plus its first post and bump
 * the counters of the target forum.
 *
 * Reads the globals $fid, $userID and $userName for the target board and
 * the posting account. Database errors are echoed, as before.
 *
 * @param array $content Article with the keys 'subject', 'content',
 *                       'createTime' (unix timestamp) and 'url'
 *                       (stored as collectionURL).
 * @return void
 */
function importBBS($content) {
    global $fid, $userID, $userName;

    // Nothing usable was scraped (e.g. the download failed): do not create
    // an empty thread.
    if (empty($content) || !isset($content['subject'], $content['content'])) {
        return;
    }

    // Every scraped value is escaped: a title containing a quote would
    // otherwise break the statement.
    $subject = mysql_real_escape_string($content['subject']);
    $message = mysql_real_escape_string($content['content']);
    $sourceUrl = mysql_real_escape_string(isset($content['url']) ? (string) $content['url'] : '');
    $created = isset($content['createTime']) ? (int) $content['createTime'] : 0;

    $sql = "INSERT INTO cdb_threads(
        fid, author, subject,
        lastpost, lastposter,
        authorid, dateline, isCollection, collectionURL
    ) VALUES (
        '$fid', '$userName', '$subject',
        '$created', '$userName',
        '$userID', '$created', '1', '$sourceUrl'
    )";

    if (!mysql_query($sql)) {
        echo mysql_error();
        // Without a thread row there is no tid to attach the post to.
        return;
    }

    $tid = mysql_insert_id();

    $sql = "INSERT INTO cdb_posts(
        fid, tid, subject,
        first, author, authorid,
        dateline, message, htmlon, bbcodeoff, smileyoff
    ) VALUES (
        '$fid', '$tid', '$subject',
        '1', '$userName', '$userID',
        '$created', '$message', '1', 1, 1
    )";

    if (!mysql_query($sql)) {
        echo mysql_error();
        return;
    }

    // Count the new thread/post only after both rows were really written.
    $sql = "UPDATE cdb_forums SET threads=threads+1, posts=posts+1 WHERE fid='$fid'";
    mysql_query($sql);
}

/**
 * Download an article page and extract its publish time, author, source
 * and body using the global marker strings.
 *
 * Each marker is searched only after the previous one, and the text is
 * always split at the FIRST occurrence of a marker, so a closing tag that
 * appears again inside the article body no longer truncates it.
 *
 * @param array $url Entry produced by getSubject(): 'link' (article URL),
 *                   'subject' (title) and 'url' (value passed through).
 * @return array Keys 'createTime', 'author', 'source', 'content',
 *               'subject', 'url'; an empty array when the page could not
 *               be fetched or a required marker is missing.
 */
function getContent($url) {
    global $TimeStart, $AuthorStart, $sourceStart, $sourceEnd, $contentStart, $contentEnd;

    // Best effort: an unreachable article yields an empty result and is
    // not treated as a fatal error.
    $data = @file_get_contents($url['link']);
    if (empty($data)) {
        return array();
    }

    // Markers in page order. The second element names the field that
    // receives the text found BEFORE the marker (null = discard it).
    $cuts = array(
        array($TimeStart, null),
        array($AuthorStart, 'createTime'),
        array($sourceStart, 'author'),
        array($sourceEnd, 'source'),
        array($contentStart, null),
    );

    $ret = array();
    $rest = $data;
    foreach ($cuts as $cut) {
        $pieces = explode($cut[0], $rest, 2);
        if (count($pieces) < 2) {
            // Marker not found: the page does not have the expected layout.
            return array();
        }
        if ($cut[1] !== null) {
            $ret[$cut[1]] = $pieces[0];
        }
        $rest = $pieces[1];
    }

    $ret['createTime'] = strtotime($ret['createTime']);

    // The body ends at the first end marker; when that marker is absent
    // the remainder of the page is used, as before.
    $pieces = explode($contentEnd, $rest, 2);
    $ret['content'] = $pieces[0];

    $ret['subject'] = isset($url['subject']) ? $url['subject'] : null;
    $ret['url'] = isset($url['url']) ? $url['url'] : null;

    return $ret;
}

/**
 * Return the text between the first occurrence of $start and the first
 * occurrence of $end that follows it.
 *
 * @param string $data  Text to search.
 * @param string $start Marker that opens the block.
 * @param string $end   Marker that closes the block.
 * @return string The enclosed text; everything after $start when $end is
 *                absent; '' when $start is not found.
 */
function getBlock($data, $start, $end) {
    // Limit 2 keeps everything after the first start marker in one piece,
    // even when the marker shows up again further down.
    $afterStart = explode($start, (string) $data, 2);
    if (count($afterStart) < 2) {
        return '';
    }

    $beforeEnd = explode($end, $afterStart[1], 2);
    return $beforeEnd[0];
}

/**
 * Extract the pager links (anchors with target="_self") from a fragment
 * of a listing page.
 *
 * @param string $data HTML fragment that contains the pager.
 * @return array Link targets in document order, keyed 1..n; empty when no
 *               link matches.
 */
function getPage($data) {
    $linkArray = $ret = array();

    // The former "e" modifier had no meaning for preg_match_all() and makes
    // the pattern fail to compile on PHP 7+, so it is dropped.
    $pattern = '/<a href="([^ ]+)" target="_self">([^<>]+)<\/a>/is';
    if (!preg_match_all($pattern, (string) $data, $linkArray, PREG_SET_ORDER)) {
        return $ret;
    }

    // Keys start at 1, as they always did.
    $i = 1;
    foreach ($linkArray as $v) {
        $ret[$i++] = $v[1];
    }
    return $ret;
}

/**
 * Extract the article links (anchors with class="link12") from a listing
 * fragment, stopping at the article whose title equals the stop subject.
 *
 * @param string     $data        HTML fragment that contains the links.
 * @param array|null $stopSubject 'subject' = title at which to stop,
 *                                'url' = value copied into every entry.
 * @return array List of array('link', 'subject', 'url') in document order.
 *               When the stop title is reached, the list ends with the
 *               sentinel array('link' => '', 'subject' => ''). Always an
 *               array, empty when nothing matches.
 */
function getSubject($data, $stopSubject) {
    $ret = array();
    $linkArray = array();

    // Tolerate a missing or partial $stopSubject without notices.
    $stopTitle = isset($stopSubject['subject']) ? (string) $stopSubject['subject'] : '';
    $sourceUrl = isset($stopSubject['url']) ? $stopSubject['url'] : null;

    // The former "e" modifier had no meaning for preg_match_all() and makes
    // the pattern fail to compile on PHP 7+, so it is dropped.
    $pattern = '/<a href="([^ ]+)" target="_blank" class="link12">([^<>]+)<\/a>/is';
    if (!preg_match_all($pattern, (string) $data, $linkArray, PREG_SET_ORDER)) {
        return $ret;
    }

    foreach ($linkArray as $v) {
        // Strict comparison: numeric-looking titles must not match loosely.
        if ($stopTitle !== '' && $stopTitle === $v[2]) {
            // Sentinel that tells the caller to stop scanning further pages.
            $ret[] = array('link' => '', 'subject' => '');
            return $ret;
        }

        $ret[] = array(
            'link'    => $v[1],
            'subject' => $v[2],
            'url'     => $sourceUrl,
        );
    }
    return $ret;
}

?>
返回列表