#!/usr/bin/perl ###################################################################### # snowclone.pl, version 1.05 # # This is a program for finding variants of snowclones on the web # and counting up the number of times each variant is used. It # requires that the URL-fetching program "wget" is installed and # in the PATH. Results are printed to stdout, with progress reports # printed to stderr. # # Usage: snowclone.pl [exclude] # Examples: snowclone.pl "Eskimos have X words for snow" # snowclone.pl "X have Y words for Z" "Eskimos" # snowclone.pl "all your * are belong to us" # # - The mandatory argument is the phrasal template, with # slots to be filled indicated by the upper-case letters X, Y, or # Z, or by the wildcard character *. So, for example: # X have Y words for Z # * have * words for * # (Note that the particular letter used doesn't matter -- that is, # this program doesn't require that matches for a pattern like # "to X or not to X" have the same value for X. # - The optional argument [exclude] is a list of words to exclude # from the search. This can be useful if a small number of # variants of the pattern are overwhelmingly more common than # the rest, causing the less common variants not to occur in the # top 1000 (or so) search results. # # Feel free to do whatever you want with this code. If you use it, # I'd appreciate (but don't insist on) credit or a link: # http://tenser.typepad.com/tenser_said_the_tensor/ # # Version history: # # 1.01 # At Mark Liberman's suggestion, the results are now printed out # with the count before the variant instead of the other way # around. Also, added a couple of examples to the documentation # above. # 1.02 # Replace a couple more HTML entities that have cropped up in and # around snowclones: & and " # 1.03 # Added the searchDelay variable, since Google added a captcha to # validate repeated rapid searches. # 1.04 # Fixed a bug where the variable $res in the search subroutine was # overwriting the variable $res in the find-variants loop, causing # only the last 100 (901-1000) variants to be searched. Also, # increased the count-variants delay to 2 seconds. # 1.05 # Added a check in google_count() to see if the search engine has # unquoted our search string for us. If so, return a count of # zero. ###################################################################### # Globals -- tweak these to get different behavior # The number of search results to examine for variants $searchLimit = 1000; # The number of seconds to delay between wildcard searches to find # variants, and between non-wildcard searches to count variants $searchDelay = 30; $countDelay = 10; # The minimum number of hits a variant must receive to be output $minimumCount = 1; # The search engines to use: one for finding variants of the snowclone # and another (possibly the same as the first) for counting # occurrences of those variants. # # A search engine has to support wildcard searches (e.g. "holy * # batman") in order to be useful for finding variants (except in the # subset of snowclones where the slot being filled is on an edge, # e.g. "X delenda est"). Only Google and Yahoo seem to support this # feature, so the available choices for finding variants are: # google # google groups # google blogsearch # yahoo # # Note that while Yahoo is included above, it's not really practical # to use it for finding variants -- the excerpts it provides for # wildcard searches very often don't include the text that matched the # query, which greatly reduces the number of variants found. What's # more, Yahoo detects the sort of repeated searching this script # performs and blocks the originating site. Nice. # # In order to serve for counting variants, all a search engine has to # support is quoted-string searches. This is much more common than # wildcard searches, but still not universal -- Ask.com, for example, # doesn't support them. The available choices for counting variants # are: # google # google groups # google blogsearch # yahoo # live # gigablast # exalead (warning: SLOW) $variantSearchEngine = "google"; $countSearchEngine = "google"; ###################################################################### # Subroutines for fetching URLs from the web, searching for patterns # on either Google or Yahoo, and for counting occurrences of patters # on Google or Yahoo. At the moment, there's one _search routine and # one _count routine for each search engine supported. These could # be factored further, but at some penalty to readability. # Fetch the contents at a URL from the web, and return them. sub get_url { my $url = shift; # options for wget, including a user-agent that mimics Firefox 2.0 my $wg = "wget -q -O - -U \"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0\""; return `$wg "$url"`; } ###################################################################### # Search subroutines # Search Google for the URL-encoded pat, return num results, start at start sub google_search { my $pat = shift; my $num = shift; my $start = shift; my $url = "http://www.google.com/search?q=${pat}&num=${num}"; if ($start > 1) { $url .= "&start=${start}"; } $res = get_url($url); if ($res =~ /

... but your query looks similar/) { print STDERR "\nCaptcha!\n"; } return $res; } # Search Google Groups for the URL-encoded pat, return num results, # start at start sub google_groups_search { my $pat = shift; my $num = shift; my $start = shift; my $url = "http://groups.google.com/groups?q=${pat}&num=${num}"; if ($start > 1) { $url .= "&start=${start}"; } return get_url($url); } # Search Google Blog Search for the URL-encoded pat, return num results, # start at start sub google_blogsearch_search { my $pat = shift; my $num = shift; my $start = shift; my $url = "http://blogsearch.google.com/blogsearch?q=${pat}&num=${num}"; if ($start > 1) { $url .= "&start=${start}"; } return get_url($url); } # Search Yahoo for the URL-encoded pat, return num results, start at start sub yahoo_search { my $pat = shift; my $num = shift; my $start = shift; my $url = "http://search.yahoo.com/search?p=${pat}&n=${num}"; if ($start > 1) { $url .= "&b=${start}"; } sleep(5); # Go slow to avoid hammering the site and getting blocked return get_url($url); } # Search Live Search for the URL-encoded pat, start at start. Note that # num isn't used because Live doesn't support it as part of the search # URL. sub live_search { my $pat = shift; my $num = shift; my $start = shift; my $url = "http://search.live.com/results.aspx?q=${pat}"; if ($start > 1) { $url .= "&first=${start}"; } return get_url($url); } # Search Gigablast for the URL-encoded pat, return num results, # start at start sub gigablast_search { my $pat = shift; my $num = shift; my $start = shift; my $url = "http://gigablast.com/search?q=${pat}&n=${num}"; if ($start > 1) { $url .= "&s=${start}"; } return get_url($url); } # Search Exalead for the URL-encoded pat, start at start. Note that # num isn't used because Exalead doesn't support it as part of the search # URL. sub exalead_search { my $pat = shift; my $num = shift; my $start = shift; my $url = "http://www.exalead.com/search?q=${pat}"; if ($start > 1) { $url .= "&b=${start}"; } return get_url($url); } # Search using the selected search engine sub web_search { my $pat = shift; my $num = shift; my $start = shift; if ($start + $num > 1000) { $num = 1000 - $start; } if ($searchDelay > 0) { sleep($searchDelay); } if ($variantSearchEngine eq "google") { return google_search($pat, $num, $start); } elsif ($variantSearchEngine eq "google groups") { return google_groups_search($pat, $num, $start); } elsif ($variantSearchEngine eq "google blogsearch") { return google_blogsearch_search($pat, $num, $start); } elsif ($variantSearchEngine eq "yahoo") { return yahoo_search($pat, $num, $start); } } ###################################################################### # Count subroutines # Use Google to count the number of occurences of pat sub google_count { my $pat = shift; # only ask for one result, since we're really after the count my $res = google_search($pat, 1, 0); # check if Google has unhelpfully turned out quoted search unquoted if ($res =~ /No results found for /) { return "0"; } # extract the count $res =~ /of (about )?([^<]+)<\/b>/; return $2; } # Use Google Groups to count the number of occurences of pat sub google_groups_count { my $pat = shift; # only ask for one result, since we're really after the count my $res = google_groups_search($pat, 1, 0); # extract the count $res =~ /of (about )?([^<]+)<\/b>/; return $2; } # Use Google Blog Search to count the number of occurences of pat sub google_blogsearch_count { my $pat = shift; # only ask for one result, since we're really after the count my $res = google_blogsearch_search($pat, 1, 0); # extract the count $res =~ /of (about )?([0-9,]+)/; return $2; } # Use Yahoo to count the number of occurences of pat sub yahoo_count { my $pat = shift; # only ask for one result, since we're really after the count my $res = yahoo_search($pat, 1, 0); # extract the count $res =~ /[0-9]+ - [0-9]+ of about ([0-9,]+)/; return $1; } # Use Live Search to count the number of occurences of pat sub live_count { my $pat = shift; # only ask for one result, since we're really after the count my $res = live_search($pat, 1, 0); # extract the count $res =~ /Page [0-9]+ of ([0-9,]+) results/; return $1; } # Use Gigablast to count the number of occurences of pat sub gigablast_count { my $pat = shift; # only ask for one result, since we're really after the count my $res = gigablast_search($pat, 1, 0); # extract the count $res =~ /Results [0-9]+<\/b> to [0-9]+<\/b> of about ([0-9,]+)<\/b>/; return $1; } # Use Exalead to count the number of occurences of pat sub exalead_count { my $pat = shift; # only ask for one result, since we're really after the count my $res = exalead_search($pat, 1, 0); # extract the count $res =~ /Results [0-9]+-[0-9]+<\/b> of about ([0-9,]+)<\/b>/; return $1; } # Count using the selected search engine sub web_count { my $pat = shift; my $count; if ($countDelay > 0) { sleep($countDelay); } if ($countSearchEngine eq "google") { $count = google_count($pat); } elsif ($countSearchEngine eq "google groups") { $count = google_groups_count($pat); } elsif ($countSearchEngine eq "google blogsearch") { $count = google_blogsearch_count($pat); } elsif ($countSearchEngine eq "yahoo") { $count = yahoo_count($pat); } elsif ($countSearchEngine eq "live") { $count = live_count($pat); } elsif ($countSearchEngine eq "gigablast") { $count = gigablast_count($pat); } elsif ($countSearchEngine eq "exalead") { $count = exalead_count($pat); } $count =~ s/,//g; return $count; } ###################################################################### # The main program begins here if ($#ARGV < 0 || $#ARGV > 1) { print "Usage: snowclone.pl [exclude]\n"; exit(0); } $pattern = $ARGV[0]; $exclude = $ARGV[1]; # replace * with X in the pattern since X is easier to deal with $pattern =~ s/(^| )\*( |$)/\1X\2/g; # create the pattern in the format used by in search engines URLs: # replace the word X, Y, or Z with * # replace each space with + $searchPat = $pattern; $searchPat =~ s/\b[XYZ]\b/\*/g; $searchPat =~ s/ /\+/g; $searchPat = "%22" . $searchPat . "%22"; # if we're excluding words, add them to the search pattern with "-" if ($exclude ne "") { for $ex (split / /, $exclude) { $searchPat .= "+-" . $ex; } } # create the regular expression pattern used to extract variants: # replace the word X, Y, or Z with a regex that matches one or more words # surround the whole thing with parentheses $regexPat = $pattern; $regexPat =~ s/\b[XYZ]\b/([a-z0-9]|(\\b ?\\b)|(\\w[-']\\w))+/g; $regexPat = "(" . $regexPat . ")"; # change to if (1) to debug the patterns if (0) { print STDERR "Search engine pattern: >$searchPat<\n"; print STDERR "Regex pattern: >$regexPat<\n"; } # get 100 search results at a time until we have $searchLimit results $search_res = ""; for ($i = 1; $i < $searchLimit; $i += 100) { print STDERR "Finding variants: " . int($i / 100 + 1) . " / " . ($searchLimit / 100) . " \r"; $search_res .= web_search($searchPat, 100, $i); $search_res .= "\n\n"; } # change to if (1) to dump the search results if (0) { print $search_res; } # clean up the concatenated search results a bit: # strip out simple HTML tags like # strip out punctuation commas # replace the following HTML entities: # ' with ' # " with " # & with & $search_res =~ s/<\/?[^ >]+>//g; $search_res =~ s/, / /g; $search_res =~ s/'/'/g; $search_res =~ s/"/"/g; $search_res =~ s/&/&/g; # search for the pattern, getting an array of all the matches @matches = ($search_res =~ /$regexPat/ig); # since the pattern contains nested parentheses, @matches may contain # spurious empty, 1, 2, or 3-character matches; remove those @matches = grep(/..../, @matches); # create a list of the variants found, filtering a bit more: # turn dashes into spaces # replace multiple spaces with one space # strip spaces at the beginning and end for $m (@matches) { $m =~ s/-/ /g; $m =~ s/ +/ /g; $m =~ s/(^ +)|( +$)//g; $m = lc($m); if ($m ne "") { $variants[$varCount++] = $m; } } # this cryptic business removes duplicates in @variants %seen = (); @variants = grep { ! $seen{$_} ++ } @variants; # change to if (1) to write out the variants to a file before continuing if (0) { open(VARS, '>variants'); for (@variants) { print VARS; print VARS "\n"; } close(VARS); } if ($#variants > 1) { print STDERR "\n"; } # pass through the @variants and do a search on each, extracting the # (approximate) count for ($i = 0; $i <= $#variants; $i++) { print STDERR "Counting variants: " . ($i + 1) . " / " . ($#variants + 1) . " \r"; $v = $variants[$i]; $varPat = $v; $varPat =~ s/ /\+/g; $varPat = "%22" . $varPat . "%22"; $count = web_count($varPat); if ($count >= $minimumCount) { $counts{$v} = $count; } } @sorted = sort { $counts{$b} <=> $counts{$a} } keys %counts; # An attempt to clean up the results a bit: if a less specific # (i.e. shorter) variant was found, remove any of the more # specific (i.e. longer) variants whose counts are lower # than some fraction of the less specific version's count. # # This is dangerous because it can be too aggressive, especially # if the fraction is too small, say 1.0. If we're searching # for "where have you gone X" and happen to get the variant # "where have you gone Joe" (because of a truncated excerpt), it # will presumably have a slightly higher count than "where have you # gone Joe DiMaggio", and so the fuller, more correct answer would # be removed. if (0) { $fraction = 3.0; for ($i = 0; $i <= $#sorted - 1; $i++) { for ($j = $i + 1; $j <= $#sorted; $j++) { $hi = $sorted[$i]; $lo = $sorted[$j]; if ($hi ne "" && $lo =~ /$hi/ && $counts{$hi} > $fraction * $counts{$lo}) { $sorted[$j] = ""; } } } } $finalCount = 0; # finally (finally!) print out the results for $k (@sorted) { if ($k ne "") { $finalCount++; print "$counts{$k}\t$k\n"; } } print STDERR "\nWrote $finalCount variants\n";