概述
/*
$content = file_get_contents($url);
$content = iconv("GBK","UTF-8",$content);
print_r($match);
*/
// Example usage: fetch the phpchina.com home page with fopen_url() and run
// link-extraction patterns over the result.
// NOTE(review): the first preg_match_all() call below is truncated in this copy
// (its pattern, subject and output arguments are missing), yet the second call
// reads $match[1][0]; restore the missing call before running this snippet.
$url = "http://www.phpchina.com/";
$body = fopen_url($url);
preg_match_all('|
preg_match_all('|]*href=/"([^/"]+)/"/s*title="([^/"]+)"[^<>]*>([^<>]+)/a>|i', $match[1][0], $matches);
print_r($matches); //或match_links($match[1][0])
/**
 * Fetch the contents of a remote file.
 *
 * file_get_contents() is tried first (or, where that function is missing,
 * fopen()/fread() when allow_url_fopen is enabled). If that yields nothing,
 * cURL is used as a fallback when the extension is loaded. The body is
 * returned unmodified.
 *
 * @param string $url HTTP address of the file to read.
 * @return string|false The body on success; false when a transport was tried
 *                      and failed; '' when no transport is available at all.
 */
function fopen_url($url)
{
    // null means "no transport has produced a result yet".
    $file_content = null;

    if (function_exists('file_get_contents')) {
        $file_content = @file_get_contents($url);
    } elseif (ini_get('allow_url_fopen') && ($file = @fopen($url, 'rb'))) {
        $file_content = '';
        $i = 0;
        // Read at most 1000 chunks of up to 4096 bytes each.
        while (!feof($file) && $i++ < 1000) {
            $chunk = fread($file, 4096);
            if ($chunk === false) {
                break;
            }
            $file_content .= $chunk;
        }
        fclose($file);
    }

    // Fall back to cURL when the stream-based read was unavailable or failed
    // (for example because allow_url_fopen is disabled).
    if (($file_content === null || $file_content === false) && function_exists('curl_init')) {
        $curl_handle = curl_init();
        curl_setopt($curl_handle, CURLOPT_URL, $url);
        curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 2);
        curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl_handle, CURLOPT_FAILONERROR, 1);
        curl_setopt($curl_handle, CURLOPT_USERAGENT, 'Trackback Spam Check'); // user agent sent with the request
        $file_content = curl_exec($curl_handle);
        curl_close($curl_handle);
    }

    return $file_content === null ? '' : $file_content;
}
/**
 * Extract hyperlinks from an HTML fragment.
 *
 * The result always has three lists:
 *   'link'    => href values (all quoted hrefs first, then all unquoted ones),
 *   'content' => non-empty anchor texts,
 *   'all'     => the complete matched <a ...>...</a> elements.
 *
 * Example result shape:
 *   array(
 *       'link'    => array('http://www.phpchina.com/?action-viewnews-itemid-37454', ...),
 *       'content' => array('anchor text', ...),
 *       'all'     => array('<a href="...">anchor text</a>', ...),
 *   )
 *
 * Values that PHP treats as empty (including the string "0") are skipped, so
 * the lists are not guaranteed to line up index-for-index.
 *
 * NOTE(review): the opening part of the pattern was unreadable in the reviewed
 * copy; confirm the expression below against the upstream snippet.
 *
 * @param string $document HTML to scan.
 * @return array Lists keyed by 'link', 'content' and 'all'.
 */
function match_links($document)
{
    $match = array('link' => array(), 'content' => array(), 'all' => array());

    // Group 2: quoted href, group 3: unquoted href, group 4: anchor text.
    $pattern = '~<\s*a\s.*?href\s*=\s*(["\'])?(?(1)(.*?)\1|([^\s>]+))[^>]*>?(.*?)</a>~isx';
    if (!preg_match_all($pattern, (string) $document, $links)) {
        return $match;
    }

    // Capture-group => result key, processed in the same order as before.
    $targets = array(
        array(2, 'link'),
        array(3, 'link'),
        array(4, 'content'),
        array(0, 'all'),
    );
    foreach ($targets as $target) {
        foreach ($links[$target[0]] as $val) {
            if (!empty($val)) {
                $match[$target[1]][] = $val;
            }
        }
    }

    return $match;
}
// ####################### Read a file and return it as one string #######################
/**
 * Read a file (or URL) with file() and return its lines joined back together.
 *
 * The source is read once. An empty file yields an empty string; only a
 * failed read terminates the script.
 *
 * @param string $url Path or URL to read.
 * @return string Full contents of the file.
 */
function openfile($url)
{
    $lines = file($url);
    if ($lines === false) {
        die("文件打开失败!");
    }

    // file() keeps the line endings, so a plain join restores the original text.
    return implode('', $lines);
}
// ####################### Split a string between two markers #######################
/**
 * Return the text that follows the first $start marker, up to the next $end
 * marker (or up to the next occurrence of $start, whichever comes first).
 *
 * @param string $start Opening marker.
 * @param string $end   Closing marker.
 * @param string $file  Text to search.
 * @return string The extracted text, or '' when $start is not found or a
 *                marker is empty.
 */
function cut($start, $end, $file)
{
    // explode() cannot split on an empty delimiter.
    if ((string) $start === '' || (string) $end === '') {
        return '';
    }

    $content = explode($start, $file);
    if (!isset($content[1])) {
        // Start marker not present: nothing to extract.
        return '';
    }

    $content = explode($end, $content[1]);
    return $content[0];
}
// ####################### Remove unwanted code #######################
/**
 * Strip the text that cut() finds between $start and $end from $content,
 * then strip every remaining "$start$end" pair.
 *
 * Every occurrence of the extracted text is removed, not only the one that
 * sits between the markers.
 *
 * @param string $start   Opening marker.
 * @param string $end     Closing marker.
 * @param string $content Text to clean.
 * @return string The cleaned text.
 */
function del($start, $end, $content)
{
    $inner = cut($start, $end, $content);

    // str_replace() applies the search terms one after another, in order.
    return str_replace(array($inner, $start . $end), "", $content);
}
// ####################### Extract the domain name #######################
/**
 * Reduce a URL to its host name and drop any "www." it contains.
 *
 * Input that contains no http:// or https:// URL is returned unchanged apart
 * from the "www." removal.
 *
 * @param string $url URL to analyse.
 * @return string Host name without "www.".
 */
function getname($url)
{
    // '#' is used as the delimiter so the slashes of "://" need no escaping.
    $referer = preg_replace('#https?://([^/]+).*#i', '$1', $url);
    return str_replace("www.", "", $referer);
}
// ####################### Remove HTML table code #######################
/**
 * Remove <table> elements, including their contents, from $content.
 *
 * The match is non-greedy, so with nested tables it stops at the first
 * closing tag and leaves the remainder of the outer table behind.
 *
 * NOTE(review): the pattern was unreadable in the reviewed copy and is written
 * here by analogy with clsscript(), clsifr() and clsa(); confirm that removing
 * the table contents (not just the tags) is the intended behaviour.
 *
 * @param string $content HTML to clean.
 * @return string|null Cleaned HTML, or null if the regex engine fails.
 */
function clstable($content)
{
    return preg_replace('/<table[^>]*?>.*?<\/table>/si', "", $content);
}
// ####################### Remove HTML script code #######################
/**
 * Remove <script> elements, including their contents, from $content.
 *
 * Matching is case-insensitive and spans line breaks.
 *
 * @param string $content HTML to clean.
 * @return string|null Cleaned HTML, or null if the regex engine fails.
 */
function clsscript($content)
{
    return preg_replace('/<script[^>]*?>.*?<\/script>/si', "", $content);
}
// ####################### Remove HTML div code #######################
/**
 * Remove <div> elements, including their contents, from $content.
 *
 * The match is non-greedy, so with nested divs it stops at the first closing
 * tag and leaves the remainder of the outer div behind.
 *
 * NOTE(review): the pattern was unreadable in the reviewed copy and is written
 * here by analogy with clsscript(), clsifr() and clsa(); confirm that removing
 * the div contents (not just the tags) is the intended behaviour.
 *
 * @param string $content HTML to clean.
 * @return string|null Cleaned HTML, or null if the regex engine fails.
 */
function clsdiv($content)
{
    return preg_replace('/<div[^>]*?>.*?<\/div>/si', "", $content);
}
// ####################### Remove HTML iframe code #######################
/**
 * Remove <iframe> elements, including their contents, from $content.
 *
 * Matching is case-insensitive and spans line breaks.
 *
 * @param string $content HTML to clean.
 * @return string|null Cleaned HTML, or null if the regex engine fails.
 */
function clsifr($content)
{
    return preg_replace('/<IFRAME[^>]*?>.*?<\/IFRAME>/si', "", $content);
}
// ####################### Remove HTML tr/td code #######################
/**
 * Remove table cells and rows from $content.
 *
 * Complete <td>...</td> and <tr>...</tr> elements are removed together with
 * their contents first; any stray opening or closing tr/td tags that remain
 * are removed afterwards.
 *
 * @param string $content HTML to clean.
 * @return string|null Cleaned HTML, or null if the regex engine fails.
 */
function clstrtd($content)
{
    // preg_replace() applies the patterns one after another, in this order.
    // \b keeps "<tr" from matching longer tag names such as <track>.
    $patterns = array(
        '/<td\b[^>]*?>.*?<\/td>/si',
        '/<tr\b[^>]*?>.*?<\/tr>/si',
        '/<tr\b[^>]*?>/si',
        '/<td\b[^>]*?>/si',
        '/<\/tr>/si',
        '/<\/td>/si',
    );

    return preg_replace($patterns, "", $content);
}
// ####################### Remove HTML hyperlinks #######################
/**
 * Remove <a> elements, including their link text, from $content.
 *
 * @param string $content HTML to clean.
 * @return string|null Cleaned HTML, or null if the regex engine fails.
 */
function clsa($content)
{
    // \b stops the pattern from treating <abbr>, <article>, ... as anchors.
    return preg_replace('/<a\b[^>]*?>.*?<\/a>/si', "", $content);
}
// ####################### Strip all HTML code #######################
/**
 * Convert HTML to plain text.
 *
 * Steps, applied in order: drop <script> blocks, drop all remaining tags,
 * collapse whitespace that follows a line break, decode a fixed set of named
 * entities, and finally decode decimal numeric entities.
 *
 * Entities above 127 are decoded with chr(), i.e. to a single byte rather
 * than to a UTF-8 sequence.
 *
 * @param string $content HTML to convert.
 * @return string|null Plain text, or null if the regex engine fails.
 */
function clearhtml($content)
{
    $search = array(
        '~<script[^>]*?>.*?</script>~si', // strip javascript
        '~<[/!]*?[^<>]*?>~si',            // strip HTML tags
        '~([\r\n])[\s]+~',                // strip white space after a line break
        '~&(quot|#34);~i',                // replace HTML entities
        '~&(amp|#38);~i',
        '~&(lt|#60);~i',
        '~&(gt|#62);~i',
        '~&(nbsp|#160);~i',
        '~&(iexcl|#161);~i',
        '~&(cent|#162);~i',
        '~&(pound|#163);~i',
        '~&(copy|#169);~i',
    );
    $replace = array(
        "",
        "",
        '$1',
        '"',
        "&",
        "<",
        ">",
        " ",
        chr(161),
        chr(162),
        chr(163),
        chr(169),
    );

    $text = preg_replace($search, $replace, $content);
    if ($text === null) {
        return null;
    }

    // Decimal numeric entities. A callback is used because the /e (evaluate)
    // pattern modifier no longer exists in current PHP versions.
    return preg_replace_callback(
        '~&#(\d+);~',
        function ($m) {
            return chr((int) $m[1]);
        },
        $text
    );
}
// ####################### Write a cache file #######################
/**
 * Write $cachedata to ./<cachedir>/<cachename>.php, relative to the current
 * working directory.
 *
 * The directory is created when missing, including intermediate directories.
 * If the file cannot be opened for writing, a message is printed and the
 * script exits.
 *
 * NOTE(review): both the directory and the file are created with mode 0777
 * (world-writable); confirm this is acceptable for the deployment.
 *
 * @param string $cachedir  Directory name, without leading "./" or trailing "/".
 * @param string $cachename File name without the ".php" extension.
 * @param string $cachedata Data to write.
 * @return void
 */
function writetocache($cachedir, $cachename, $cachedata = '')
{
    $cachedir = './' . $cachedir . '/';
    $cachefile = $cachedir . $cachename . '.php';

    if (!is_dir($cachedir)) {
        // Recursive, so a nested cache directory such as "a/b" works too.
        @mkdir($cachedir, 0777, true);
    }

    $fp = @fopen($cachefile, 'wb');
    if (!$fp) {
        echo 'Can not write to cache files, please check directory ./cache/ .';
        exit;
    }

    @fwrite($fp, $cachedata);
    @fclose($fp);
    @chmod($cachefile, 0777);
}
// ####################### Collect the links found in a document #######################
/**
 * Collect everything captured by the first group of $re in $ufile, apply a
 * string replacement to each hit, and return the distinct results.
 *
 * @param string       $re    PCRE pattern with at least one capture group.
 * @param string       $ufile Text to search.
 * @param string|array $rep1  Search value(s) passed to str_replace().
 * @param string|array $rep2  Replacement value(s) passed to str_replace().
 * @return array Distinct results, re-indexed from 0; empty when nothing matched.
 */
function geturl($re, $ufile, $rep1, $rep2)
{
    preg_match_all($re, $ufile, $out, PREG_PATTERN_ORDER);

    $outs = array();
    if (isset($out[1]) && is_array($out[1])) {
        foreach ($out[1] as $hit) {
            $outs[] = str_replace($rep1, $rep2, $hit);
        }
    }

    // Merge identical links and re-index (the same operation resetar() performs).
    return array_values(array_unique($outs));
}
// ####################### Split a string between two markers #######################
// cut() is also declared earlier in this file, and declaring a function twice
// is a fatal error, so this copy is only defined when no cut() exists yet.
if (!function_exists('cut')) {
    /**
     * Return the text that follows the first $start marker, up to the next
     * $end marker (or up to the next occurrence of $start, whichever comes
     * first).
     *
     * @param string $start Opening marker.
     * @param string $end   Closing marker.
     * @param string $file  Text to search.
     * @return string The extracted text, or '' when $start is not found or a
     *                marker is empty.
     */
    function cut($start, $end, $file)
    {
        // explode() cannot split on an empty delimiter.
        if ((string) $start === '' || (string) $end === '') {
            return '';
        }

        $content = explode($start, $file);
        if (!isset($content[1])) {
            // Start marker not present: nothing to extract.
            return '';
        }

        $content = explode($end, $content[1]);
        return $content[0];
    }
}
// ####################### Remove unwanted code #######################
// del() is also declared earlier in this file, and declaring a function twice
// is a fatal error, so this copy is only defined when no del() exists yet.
if (!function_exists('del')) {
    /**
     * Strip the text that cut() finds between $start and $end from $content,
     * then strip every remaining "$start$end" pair.
     *
     * Every occurrence of the extracted text is removed, not only the one
     * that sits between the markers.
     *
     * @param string $start   Opening marker.
     * @param string $end     Closing marker.
     * @param string $content Text to clean.
     * @return string The cleaned text.
     */
    function del($start, $end, $content)
    {
        $del = cut($start, $end, $content);
        $content = str_replace($del, "", $content);
        $content = str_replace($start . $end, "", $content);
        return $content;
    }
}
// ####################### Remove duplicate values and re-index an array #######################
/**
 * Drop duplicate values from $outs and renumber the survivors from 0.
 *
 * The first occurrence of each value is kept, in its original order.
 *
 * @param array $outs Values to de-duplicate.
 * @return array De-duplicated, sequentially indexed values.
 */
function resetar($outs)
{
    $distinct = array_unique($outs);

    // array_unique() keeps the original keys; array_values() renumbers them.
    return array_values($distinct);
}
最后
以上就是怡然巨人为你收集整理的《PHP 采集常用函数整理》的全部内容,希望本文能够帮你解决在使用 PHP 采集常用函数时遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
发表评论 取消回复