I'm having a slight problem with indexing all pages. I think it's a doctype problem, because the script echoes back the whole site but doesn't put all of the pages into the database table. Decent code nonetheless. I'm planning on ranking results by where the search term matches: the title first, then the description, then the keywords, and so on. Cool spider though; it just needs one final tweak. Can someone help me out, please?
// Site indexer: walks one folder, and for every file whose name contains
// "htm" (or matches "sun...php") stores its file name, <title>, meta
// description, meta keywords and tag-stripped <body> text as one row of the
// `fullsearch` table. Progress is echoed as each file is processed.
require_once('Connections/Connection.php');
mysql_select_db($database_Connection, $Connection);

// Folder to crawl. Single-quoted so the backslash can never be read as an
// escape sequence.
$dir = 'C:\mywebfolder';

// Patterns that turn an HTML body into plain text. They do not depend on the
// file being processed, so they are built once, outside the loop.
// NOTE(review): the last pattern relies on the /e (eval) modifier, which is
// deprecated since PHP 5.5 and removed in PHP 7. It only ever evaluates
// "chr(<digits>)", so it is not injectable, but it should be moved to
// preg_replace_callback() when this script is ported off the mysql_* API.
$search = array ("'<script[^>]*?>.*?</script>'si", // Strip out javascript
    "'<[\/\!]*?[^<>]*?>'si", // Strip out html tags
    "'([\r\n])[\s]+'", // Strip out white space
    "'&(quot|#34);'i", // Replace html entities
    "'&(amp|#38);'i",
    "'&(lt|#60);'i",
    "'&(gt|#62);'i",
    "'&(nbsp|#160);'i",
    "'&(iexcl|#161);'i",
    "'&(cent|#162);'i",
    "'&(pound|#163);'i",
    "'&(copy|#169);'i",
    "'&#(\d+);'e"); // evaluate as php
$replace = array ("",
    "",
    "\\1",
    "\"",
    "&",
    "<",
    ">",
    " ",
    chr(161),
    chr(162),
    chr(163),
    chr(169),
    "chr(\\1)");

$dh = opendir($dir);
if ($dh === false) {
    echo "Could not open directory $dir \n";
} else {
    while (false !== ($filename = readdir($dh))) {
        // Only index HTML pages and the "sun*.php" pages.
        if (!preg_match('/htm/', $filename) && !preg_match('/sun(.*)php/', $filename)) {
            continue;
        }

        // readdir() returns bare names; every file access must go through
        // the full path or it resolves against the current working
        // directory instead of the crawled folder.
        $path = $dir . DIRECTORY_SEPARATOR . $filename;
        if (!is_file($path)) {
            continue;
        }

        // Read the whole file in one call: no handle to leak, and no
        // zero-length fread() on empty files.
        $buffer = file_get_contents($path);
        if ($buffer === false) {
            echo "Could not read $filename \n";
            continue;
        }

        // Meta tags are optional; fall back to empty strings when absent.
        $tags = get_meta_tags($path);
        $description = isset($tags['description']) ? $tags['description'] : '';
        $keywords = isset($tags['keywords']) ? $tags['keywords'] : '';
        echo "$filename : $description : $keywords \n";

        //get the title (empty when the page has none)
        $title = preg_match('#<title>(.*)</title>#isU', $buffer, $match) ? $match[1] : '';
        echo "$title \n";

        //get the content; [^>]*> consumes the attributes of the <body> tag
        //so they do not leak into the indexed text
        $bodytemp = preg_match('#<body[^>]*>(.*)</body>#isU', $buffer, $bodymatch) ? $bodymatch[1] : '';

        // Strip scripts, tags and entities, then slashes and any leftovers.
        $body1 = preg_replace($search, $replace, $bodytemp);
        $body3 = str_replace('/', '', $body1);
        $body = strip_tags($body3);
        $content = stripslashes($body);
        echo "$content \n\n";

        // Disabled update-instead-of-insert path, kept from the original:
        //$result = mysql_query ("SELECT page_id FROM fullsearch WHERE page_url = $filename");
        //$row = mysql_fetch_assoc($result);
        // if (mysql_num_rows($row) == "1"){
        // $sqlquery = "UPDATE fullsearch set page_title = '$title', page_description = '$description', page_keywords = '$keywords', page_content = '$content'
        // WHERE page_url = '$filename' ";
        // $results = mysql_query($sqlquery);
        //}
        //else{

        // Every value is escaped before it goes into the statement. Without
        // this, any page whose text contains an apostrophe produces invalid
        // SQL and is silently dropped from the index.
        // NOTE(review): page_id is still sent as '' as in the original;
        // presumably the column is AUTO_INCREMENT - confirm against the schema.
        $sqlquery = sprintf(
            "INSERT INTO fullsearch (page_id, page_url, page_title, page_description, page_keywords, page_content)
            VALUES ('','%s', '%s', '%s','%s', '%s')",
            mysql_real_escape_string($filename, $Connection),
            mysql_real_escape_string($title, $Connection),
            mysql_real_escape_string($description, $Connection),
            mysql_real_escape_string($keywords, $Connection),
            mysql_real_escape_string($content, $Connection)
        );
        $results = mysql_query($sqlquery, $Connection);
        if ($results === false) {
            // Report the failure instead of skipping the page silently.
            echo "Could not index $filename : " . mysql_error($Connection) . " \n";
        }
        //}
    }
    closedir($dh);
}