]*href=(\"??)([^\" >]*?)\\1[^>]*>(.*)<\/a>";

    // NOTE(review): the beginning of this script is not visible from here: the
    // opening PHP tag, the head of the $regexp string that ends on the line
    // above, the loop that the lone "}" further down closes, and the setup of
    // $input, $webpage, $strippedWebpage, $acceptableNodes, $connections,
    // $nodeCounter and $tldList. Everything up to that "}" is presumably the
    // per-page body of that loop -- confirm against the full file.

    // Distinct, validated hosts linked from the current page.
    $URLs = array();

    // Modifiers: s = dot matches newlines, i = case-insensitive, U = ungreedy.
    // With PREG_SET_ORDER each $match is one anchor; $match[2] is the href value.
    if (preg_match_all("/$regexp/siU", $input, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            //echo($match[2]."\t");
            $currentURL = parse_url_domain($match[2]);
            //echo($currentURL);
            //echo("\n");

            // Keep a host only once per page, and only if it is not the page
            // itself, is syntactically plausible and ends in a known TLD.
            // $URLs only ever holds strings, so a strict search is safe and
            // avoids numeric-string juggling.
            if ($currentURL != $webpage
                && !in_array($currentURL, $URLs, true)
                && check_domain($currentURL)
                && check_tld($currentURL)
            ) {
                $URLs[] = $currentURL;

                // Record an edge only towards hosts in the accepted node set,
                // and never a self-loop.
                if (in_array($currentURL, $acceptableNodes) && ($strippedWebpage != $currentURL)) {
                    $connections[] = "$strippedWebpage -> $currentURL";

                    // Start the counter explicitly so the first hit does not
                    // increment an undefined array key.
                    if (!isset($nodeCounter[$currentURL])) {
                        $nodeCounter[$currentURL] = 0;
                    }
                    $nodeCounter[$currentURL]++;
                }
            }
        }
    }

    echo("$webpage : \n");
    print_r($URLs);
}

echo("connections: ");
print_r($connections);
echo("nodeCounter: ");
print_r($nodeCounter);
echo("javascript edges: ");

// Factor applied to each node's inbound-link count in the emitted edge call.
$multiplier = 3;

foreach ($connections as $connection) {
    list($from, $to) = explode(" -> ", $connection);

    // A node that only ever appears as a source has no counter entry; treat
    // it as zero instead of reading an undefined key.
    $fromCtr = $multiplier * (isset($nodeCounter[$from]) ? $nodeCounter[$from] : 0);
    $toCtr   = $multiplier * (isset($nodeCounter[$to]) ? $nodeCounter[$to] : 0);

    // Host names come from scraped pages, and check_domain() lets "'" and "\"
    // through, so escape them before placing the text inside a single-quoted
    // JavaScript string literal. Ordinary host names are emitted unchanged.
    echo(sprintf(
        "g.addEdge($('%s'), $('%s'),%s,%s);\n",
        addcslashes($from, "\\'\r\n"),
        addcslashes($to, "\\'\r\n"),
        $fromCtr,
        $toCtr
    ));
}

/**
 * Extract the lower-cased host from a link target.
 *
 * When parse_url() reports no host (for example for a scheme-less
 * "example.com/page"), the path component is used in its place.
 *
 * @param string $url Raw href value.
 *
 * @return string Lower-cased host (or path fallback); an empty string when
 *                the URL cannot be parsed or has neither component.
 */
function parse_url_domain ($url)
{
    $parts = parse_url($url);

    // parse_url() returns false for seriously malformed URLs.
    if (!is_array($parts)) {
        return '';
    }

    if (isset($parts['host']) && $parts['host'] !== '') {
        $host = $parts['host'];
    } elseif (isset($parts['path'])) {
        $host = $parts['path'];
    } else {
        $host = '';
    }

    return strtolower($host);
}

/**
 * Check that a string looks like a dotted host name.
 *
 * The string must contain at least one dot, and every dot-separated label
 * must be either 1-64 characters from the permitted set or a double-quoted
 * run of up to 62 characters.
 *
 * @param string $url Candidate host name.
 *
 * @return bool True when every label is acceptable, false otherwise.
 */
function check_domain ($url)
{
    // Same meaning as the former "^.*\..*$" test: a dot must be present.
    if (strpos($url, '.') === false) {
        return false;
    }

    // PCRE port of the original POSIX (ereg) pattern; ereg() no longer exists
    // as of PHP 7.0. Two details keep the accepted language identical:
    //  - POSIX bracket expressions treat "\" as a literal character, so the
    //    backslash is listed explicitly in both classes that contained one;
    //  - the D modifier stops "$" from matching before a trailing newline.
    $labelPattern = '/^(?:'
        . '[A-Za-z0-9!#$%&\'*+\/=?^_`{|}~-][A-Za-z0-9!#$%&\'*+\/=?^_`{|}~\\\\.-]{0,63}'
        . '|"[^(\\\\|")]{0,62}"'
        . ')$/D';

    foreach (explode('.', $url) as $label) {
        // Empty labels ("a..b", leading or trailing dot) fail here as well.
        if (!preg_match($labelPattern, $label)) {
            return false;
        }
    }

    return true;
}

/**
 * Check whether the last dot-separated label of a host is a known TLD.
 *
 * Reads the global $tldList. Entries and the label are trimmed and compared
 * as strings without regard to case, since DNS labels are case-insensitive
 * and callers pass lower-cased hosts.
 *
 * @param string $url Host name.
 *
 * @return bool True when the final label matches an entry of $tldList.
 */
function check_tld($url)
{
    global $tldList;

    // Nothing can match if the list was never loaded.
    if (!is_array($tldList)) {
        return false;
    }

    $labels   = explode('.', $url);
    $lastpart = trim($labels[count($labels) - 1]);

    foreach ($tldList as $item) {
        // strcasecmp() compares as strings, so numeric-looking values are not
        // juggled the way "==" would.
        if (strcasecmp(trim($item), $lastpart) === 0) {
            return true;
        }
    }

    return false;
}
?>