#!/usr/local/bin/perl
# Search for RDF files in Google, list those >= $minFileSize
# based on getrdf.pl
# added use of $queryTerm
# excluding those that mention CDATA should omit RSS files that are
# only big because they include complete stories in CDATA sections.
# redif: Research Documents Information Format
# try this sometime: http://www.google.com/search?num=50&q=%22large%20rdf%20file%22
#
# Usage: pass the query term as a command-line argument. Output is one
# matching URL per line, preceded by two HTML comments that record the
# query and the size threshold.

use strict;
use warnings;
use SOAP::Lite;

if (!@ARGV) {
    print "No query term provided as parameter. Exiting.\n";
    exit();
}

# Only the last command-line argument is used as the query term.
my $queryTerm = pop(@ARGV);

# NOTE(review): the excluded phrase is wrapped in single quotes; confirm
# that Google treats -'cvs log for' as a phrase exclusion as intended.
my $query = "filetype:rdf $queryTerm -CDATA -redif -'cvs log for' ";
#my $query = "filetype:n3 $queryTerm -CDATA ";

my $minFileSize = 50;  # Only list files bigger than this, in K. 2004-12-02 changed from 95
                       # 2005-03-15T15:25 changed from 75 because of n3 files

# Number of times to loop, retrieving 10 results at a time
my $loops = 99;        # loops x 10 results per loop
my $resultsPerLoop = 10;

my $google_key  = 'yourGoogleKeyHere';  # up to 1000 queries/day
my $google_wsdl = "./GoogleSearch.wsdl";

# URLs matching any of these patterns are never listed. Dots in host names
# are escaped so that they only match a literal '.'.
my @excluded_url_patterns = (
    qr{chatlogs\.musicbrainz},
    qr{gd\.tuwien\.ac\.at/opsys},
    qr{trustix}i,
    qr{tinysofa}i,
    qr{danbri/irclogs/foaf}i,
    qr{london\.pm\.org/~jo/foaf},
);

my $google_search = SOAP::Lite->service("file:$google_wsdl");

print "<!-- $query -->\n";
print "<!-- Files bigger than $minFileSize K: -->\n";

for my $page (0 .. $loops - 1) {
    my $offset = $page * $resultsPerLoop;

    # Query Google
    # file:///C:/dev/googleapi/APIs_Reference.html#request_parameters:
    # key, query, 0-based index of 1st result, maxResults, filter, restrict,
    # safeSearch, language restrict, UTF-8, UTF-8
    my $results = $google_search->doGoogleSearch(
        $google_key, $query, $offset, $resultsPerLoop,
        "false", "", "false", "", "UTF-8", "UTF-8"
    );

    # A missing response would otherwise die on the dereference below with
    # an unhelpful message, so report it and stop.
    if (!$results) {
        warn "No response from Google at offset $offset; stopping.\n";
        last;
    }

    # No sense continuing unless there are more results
    my $elements = $results->{resultElements};
    last unless $elements && @{$elements};

    # Loop through the results
    foreach my $result (@{$elements}) {
        # Results without a cached size cannot be size-filtered; skip them.
        next unless $result->{cachedSize};

        my $fileSize = $result->{cachedSize};
        my $URL      = $result->{URL};

        # Drop the 'k' from the cached size so it compares as a number.
        $fileSize =~ s/k//;

        next unless $fileSize >= $minFileSize;
        next if grep { $URL =~ $_ } @excluded_url_patterns;

        # printf("%5d %s\n",$fileSize,$result->{URL});  # filesize usually 101 anyway
        print $URL . "\n";
    }
}