Вот
В файле doors.txt должны находиться урлы.
Сначала скрипт проверяет, жива ли страница: признак живости — наличие урла проверяемой страницы в её тексте. Если страница жива, скрипт вытягивает метатеги.
На выходе:
doors_checked.txt - все проверяемые паги
doors_checked_alive.txt - только живые, с тайтлами, кейвордами и дескрипшенами.
Также есть немного настроек — смотри внутри.
Код:
<?php
// Run without an execution time limit: checking many remote pages is slow.
set_time_limit(0);
// Report every error level. The E_ALL constant must be passed, not the string
// "E_ALL": the string is not a valid level (PHP 8 rejects it with a TypeError,
// older versions treated it as 0, which turned reporting off).
error_reporting(E_ALL);
/**
 * SOME CONFIGURATION
 */

/**
 * Format of the input file (how the blogs are listed in it):
 * 1 - http://verizonwireless.vivopress.com/
 * 2 - login:pass:verizonwireless.vivopress.com/
 */
$format = 1;

/**
 * Clear the output files before the check or not:
 * 0 - no
 * 1 - yes
 */
$clear_out_files = 1;

/**
 * Get the content of the description and keywords tags or not
 * (the title is always collected for alive pages):
 * 0 - no
 * 1 - yes, and leave a value empty when its tag is missing
 * 2 - yes, and when there is no description or keywords build them from
 *     the title, i.e. for title = "some cool title":
 *     description = "some cool title, some, cool, title"
 *     keywords    = "some cool title, some, cool, title"
 */
$get_info = 1;

// Input file
$file_in = './doors.txt';
// Output file for all checked sites
$file_all = "./doors_checked.txt";
// Output file for only checked alive sites
$file_alive = "./doors_checked_alive.txt";

// Patterns for the title, description and keywords tags.
// The captures are lazy (.*?) so they stop at the first closing tag / quote;
// a greedy capture would swallow everything up to the last one, e.g. both
// meta tags when they sit on the same line.
$pattern_title = "|<title>(.*?)</title>|is";
$pattern_desc = "|<meta name=\"description\" content=\"(.*?)\"|i";
$pattern_keys = "|<meta name=\"keywords\" content=\"(.*?)\"|i";
/**
SCRIPT BEGIN
*/
/**
 * Splits a title into words and joins them with the given separator.
 *
 * Words are runs of non-whitespace characters, so repeated, leading or
 * trailing whitespace never produces empty entries.
 *
 * @param string $title     Title text to split.
 * @param string $separator Glue placed between the words.
 * @return string The words joined by $separator; "" for an empty or blank title.
 */
function explode_title($title, $separator = ", "){
    $words = preg_split('/\s+/', (string) $title, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        // Not expected with this pattern; fall back to plain space splitting.
        $words = explode(" ", (string) $title);
    }
    return implode($separator, $words);
}
/**
 * Strips decorative characters from a string and normalizes its whitespace.
 *
 * Removes », |, >, <, ( and ), drops every [bracketed] group, turns runs of
 * "&" into a blank and shortens "..." to ".". Afterwards whitespace runs are
 * collapsed to a single blank and both ends are trimmed.
 *
 * @param string $str Raw text.
 * @return string Cleaned text.
 */
function ClearString($str){
    // Characters that are simply dropped.
    $str = str_replace(array("»", "|", ">", "<", "(", ")"), "", $str);
    // Each [...] group is removed on its own, so text between two groups survives.
    $str = preg_replace("/\[[^\]]*\]/", "", $str);
    // Runs of & become one blank.
    $str = preg_replace("/&+/", " ", $str);
    // An ellipsis becomes a single dot.
    $str = str_replace("...", ".", $str);
    // Whitespace is normalized last because the removals above can leave gaps.
    $str = preg_replace("/\s+/", " ", $str);
    return trim($str);
}
// Read the input file once and build two index-aligned lists:
//   $strings - the raw input lines (echoed into the "alive" report),
//   $urls    - the URL to fetch for each of those lines.
$urls = array();
$strings = array();
$lines = file($file_in, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($lines === false) {
    // Missing or unreadable input file (file() has already raised a warning):
    // continue with an empty list so the rest of the script has nothing to do.
    $lines = array();
}
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === "") {
        // Whitespace-only line: nothing to check.
        continue;
    }
    switch ($format){
        case 1:
            // The line already is the URL.
            $strings[] = $line;
            $urls[] = $line;
            break;
        case 2:
            // login:pass:host/path - the limit of 3 keeps any later ":"
            // (for example a port number) inside the host part.
            $parts = explode(":", $line, 3);
            if (count($parts) < 3) {
                trigger_error("Skipping malformed input line: ".$line, E_USER_WARNING);
                break;
            }
            $strings[] = $line;
            $urls[] = "http://".$parts[2];
            break;
    }
}
// When requested, start both reports from scratch: each file is truncated and
// gets its CSV header line. Without the flag the run appends to the existing
// files. file_put_contents() opens, writes and closes in one call and only
// raises a warning when a file cannot be written, instead of passing a failed
// handle on to fputs().
if ($clear_out_files){
    file_put_contents($file_all, "\"URL\";\"STATUS\"\r\n");
    file_put_contents($file_alive, "\"URL\";\"TITLE\";\"KEYWORDS\";\"DESCRIPTION\"\r\n");
}
// The process: fetch every URL, classify it as alive or dead and, for alive
// pages, collect title / keywords / description for the second report.
foreach ($urls as $i => $url) {
    $description = "";
    $keywords = "";
    $title = "";
    $url = trim($url);

    // A failed download yields false.
    $content = file_get_contents($url);

    // Compare without trailing slashes, so that "http://someurl.com/" also
    // matches pages that mention "http://someurl.com". rtrim() only removes
    // slashes; it never cuts a real character off the URL.
    $needle = rtrim($url, "/");

    // The check: the page is alive when its text contains its own URL.
    // The position is compared with !== false because a match at offset 0
    // is still a match.
    $alive = is_string($content)
        && $needle !== ""
        && stripos($content, $needle) !== false;

    if (!$alive) {
        file_put_contents($file_all, "\"".$url."\";\"dead\"\r\n", FILE_APPEND);
        continue;
    }

    // The title is not always present.
    if (preg_match($pattern_title, $content, $matches)){
        $title = trim($matches[1]);
    }

    // Modes 1 and 2 read the meta tags; mode 0 leaves both values empty.
    if ($get_info == 1 || $get_info == 2) {
        if (preg_match($pattern_desc, $content, $matches)){
            $description = $matches[1];
        }
        if (preg_match($pattern_keys, $content, $matches)){
            $keywords = $matches[1];
        }
    }

    // Mode 2 additionally fills missing or blank values from the title.
    if ($get_info == 2) {
        // With no title there is nothing to build from; stay empty instead
        // of writing a lone ", ".
        $fallback = ($title === "") ? "" : $title.", ".explode_title($title);
        if (trim($description) === "") {
            $description = $fallback;
        }
        if (trim($keywords) === "") {
            $keywords = $fallback;
        }
    }

    file_put_contents($file_all, "\"".$url."\";\"alive\"\r\n", FILE_APPEND);

    // The alive report shows the input line as it was given (for format 2
    // that includes login:pass), falling back to the URL if it is missing.
    $raw = isset($strings[$i]) ? trim($strings[$i]) : $url;
    $fields = array($raw, $title, $keywords, $description);
    foreach ($fields as $n => $field) {
        // Double embedded quotes so a value cannot break out of its CSV field.
        $fields[$n] = str_replace("\"", "\"\"", $field);
    }
    file_put_contents($file_alive, "\"".implode("\";\"", $fields)."\"\r\n", FILE_APPEND);
}
?>