#!/usr/bin/perl
##############################################################################
## Perl script for capturing the essential details from every episode of
## specific seasons of "The Simpsons", from Wikipedia. This script will load
## each season page from Wikipedia, parse out the relevant details for each
## episode (title, writer, episode description, etc.), and write the result
## to a YAML file. This YAML file is useful for re-using these details from
## other programs.
##
## Requires the LWP suite, the YAML::XS and HTML::TableExtract Perl modules,
## and Internet access to Wikipedia.
##
## By Andy Reitz, January 17th, 2009.
## URL: http://redefine.dyndns.org/~andyr/blog/archives/2009/02/the-simpsons-on-apple-tv.html
##############################################################################

use strict;

# For fetching URLs.
use LWP::UserAgent;
use HTTP::Cookies;
use URI::Escape;
use LWP::Debug qw(level);

# For writing out the YAML file.
use YAML::XS;

# From: http://search.cpan.org/~msisk/HTML-TableExtract-2.10/lib/HTML/TableExtract.pm
use HTML::TableExtract;

# For argument parsing.
use Getopt::Long;

# Global variables.
my ($DEBUG) = 0;
my (@details) = ("Episode Number", "Episode Title", "Directed by",
                 "Written by", "Original Airdate", "Episode ID");
my (%allEpisodesHash) = ();
my ($episodeCounter) = 0;
my ($urlBase) = 'http://en.wikipedia.org/wiki/The_Simpsons_(season_%d)';

# Set default seasons to grab.
my ($optStartSeason) = 1;
my ($optEndSeason) = 11;

# Set default output filename.
my ($optDatafile) = sprintf ("simpsons_metadb_seasons_%d-%d.yml",
                             $optStartSeason, $optEndSeason);
my ($optVerbose) = 0;
my ($optHelp) = 0;

# Parse command-line arguments.
if (!GetOptions ("datafile=s"    => \$optDatafile,
                 "startSeason=i" => \$optStartSeason,
                 "endSeason=i"   => \$optEndSeason,
                 "verbose|v+"    => \$optVerbose,
                 "help|h|?"      => \$optHelp,
                 # Use --debug or -d, specify multiple times to increase
                 # debugosity.
                 "debug|d+"      => \$DEBUG)) {
    &usage();
    exit (3);
}

if ($optHelp) {
    &usage();
    exit (3);
}

if ($DEBUG > 1) {
    # Enable LWP debug messages, but only if debugging is turned up to a
    # higher level.
    level ('+debug');
}

my ($ua) = LWP::UserAgent->new;
my ($url) = "";
my ($s);
for ($s = $optStartSeason; $s <= $optEndSeason; $s++) {
    # Write the current season # into the URL.
    $url = sprintf ($urlBase, $s);

    # Get it!
    my ($page) = &fetchPage($ua, $url);
    &DBG ("Page fetched from Wikipedia: $page") if ($DEBUG > 2);

    $episodeCounter = 0;

    # Has side-effects on the global %allEpisodesHash variable.
    &extractEpisodeTableFromPage($page, $s);

    &v ("Found $episodeCounter episodes for season $s.");

    # FIXME: Should add # episodes / season to the hash.
}

# Write the %allEpisodesHash out to a YAML file on disk.
&dumpYAML($optDatafile);

if ($DEBUG > 2) {
    use Data::Dumper;
    print "**** Dump of %allEpisodesHash: ****\n";
    print Dumper \%allEpisodesHash;
    print "**** End dump. ****\n";
}

# End script.
exit (0);

# Dump the %allEpisodesHash as YAML, to the specified file.
sub dumpYAML {
    my ($wfn) = $_[0];

    if (!open (YOUT, ">", $wfn)) {
        print "ERROR: Unable to open '$wfn' for writing! Reason: $!\n";
        exit (3);
    }

    &v ("Writing YAML db to file '$wfn'.");
    print YOUT Dump \%allEpisodesHash;

    if (!close (YOUT)) {
        print "ERROR: Unable to close yaml output file. Reason: $!\n";
        exit (3);
    }
} # End dumpYAML().
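# The YAML db written by dumpYAML() above is intended to be re-used from
# other programs (see the header comment). A rough consumer sketch, shown
# here only as illustration -- the filename is this script's default output,
# and the keys come from @details plus "Description" and "Season":
#
#   use YAML::XS qw(LoadFile);
#
#   # Top-level keys are Wikipedia "Episode ID" values; each value is a
#   # hash of that episode's metadata.
#   my $episodes = LoadFile("simpsons_metadb_seasons_1-11.yml");
#   foreach my $id (sort keys %$episodes) {
#       printf ("S%02d: %s\n", $episodes->{$id}{"Season"},
#               $episodes->{$id}{"Episode Title"});
#   }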
# This is where the magic happens. Using 'HTML::TableExtract', we try to
# find the table on the Wikipedia page that contains all of the metadata
# about the episodes of that season. Once this table has been identified, we
# walk through each row, pick out the bits of metadata that we want to keep,
# perform any necessary massaging, and insert a new entry into the
# %allEpisodesHash hashtable.
#
# If Wikipedia ever changes their markup, this subroutine (and hence the
# whole script) is likely to break.
sub extractEpisodeTableFromPage {
    my ($epHash) = {};

    # The entire HTML page.
    my ($html_string) = $_[0];

    # For tracking what season we're on, which is a useful bit of data to
    # attach to each episode.
    my ($currentSeason) = $_[1];

    # Make a new TableExtract object.
    my ($te) = HTML::TableExtract->new( attribs => { class => "wikitable" } );

    # Extract the table.
    $te->parse($html_string);

    my ($ts) = "";
    foreach $ts ($te->tables) {
        &DBG ("Table with class='wikitable' found at " . join(',', $ts->coords));

        my ($row) = "";
        foreach $row ($ts->rows) {
            if ($row->[1]) {
                # This is a 6-column row, which contains the descriptive bits
                # about the episode (#, writer, director, etc.)
                &DBG ("details row: " . join (',', @$row));

                if ($row->[0] =~ /^#/) {
                    # Skip the header row.
                    &DBG ("skip header: " . $row->[0]);
                    next;
                }

                my ($e) = "";   # row element
                my ($c) = 0;    # column counter
                foreach $e (@$row) {
                    # Strip any high-ascii characters.
                    $e =~ s/[\x80-\xFF]//g;

                    # Do some data fixin'.
                    if ($details[$c] eq "Episode Title") {
                        # Condense titles that span multiple lines down to
                        # one line.
                        $e =~ s/\n\n/, /g;

                        # Remove the double quotes around the title. This
                        # should preserve double quotes that occur naturally
                        # within an episode title.
                        $e =~ s/^"(.*)"$/$1/;
                    } elsif ($details[$c] eq "Episode Number") {
                        # In Wikipedia, this is '# - #', where the '-' is
                        # some unicode bullshit. So, we just skip those
                        # characters (it's 3 bytes).
                        #
                        # The second number is the episode # for the season,
                        # which is what we want.
                        $e =~ s/^(\d+).*?(\d+)$/$2/;
                    } elsif ($details[$c] eq "Written by") {
                        # For some episodes (notably halloween episodes), the
                        # writers will span multiple lines. Let's fix this up
                        # so that it looks nice.
                        $e =~ s/\n\n/, /g;
                    }

                    $epHash->{$details[$c]} = $e;
                    $c++;
                }
            } else {
                # This is a description row - colspan is 6, and it contains
                # the episode's description.

                # Data fixin'. Remove the '[28]?' footnotes from the
                # description. The '?' is actually some high-ascii byte.
                $row->[0] =~ s/\[\d+\].$//m;

                &DBG ("description row: " . $row->[0]);

                $epHash->{"Description"} = $row->[0];
                $epHash->{"Season"} = $currentSeason;

                # Add the little episode hash that we have been building to
                # the master hash.
                if (!$allEpisodesHash{$epHash->{"Episode ID"}}) {
                    $allEpisodesHash{$epHash->{"Episode ID"}} = $epHash;
                } else {
                    # If we get here, then something is probably wrong with
                    # the data.
                    print "******\nWARN: episode ID '" . $epHash->{"Episode ID"} .
                          "' already exists in hash!\n******\n";
                }

                $episodeCounter++;

                # Reset the single episode hash.
                $epHash = {};
            } # row type
        } # foreach row
    } # foreach table
} # End extractEpisodeTableFromPage().
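# An illustrative, self-contained sketch (not part of the script's flow) of
# why the $row->[1] test above distinguishes the two row types: with
# HTML::TableExtract's default gridmap behaviour, a row whose single cell
# spans all six columns comes back with the remaining column positions
# empty, so $row->[1] is false. The markup below is made up, not real
# Wikipedia markup.
#
#   use HTML::TableExtract;
#
#   my $snippet = <<'HTML';
#   <table class="wikitable">
#     <tr><td>1 - 1</td><td>Title</td><td>Director</td><td>Writer</td>
#         <td>Airdate</td><td>ProdCode</td></tr>
#     <tr><td colspan="6">Episode description goes here.</td></tr>
#   </table>
#   HTML
#
#   my $te = HTML::TableExtract->new( attribs => { class => "wikitable" } );
#   $te->parse($snippet);
#   foreach my $row (($te->tables)[0]->rows) {
#       print $row->[1] ? "details row\n" : "description row\n";
#   }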
# Use LWP to fetch a page from Wikipedia. Takes in a UserAgent object, and a
# URL. Returns the text of the page as a scalar; on failure, prints an error
# to STDERR and returns an empty string.
sub fetchPage {
    my ($ua) = $_[0];
    my ($url) = $_[1];

    my ($req) = HTTP::Request->new (GET => $url);

    # Don't give me GZIP crapola. Wikipedia seems to ignore this in some
    # instances (particularly for Season 8).
    $req->header ('Accept-Encoding' => 'identity;q=1.0, gzip;q=0, compress;q=0');
    $req->header ('TE' => 'gzip;q=0, deflate;q=0');

    my ($res) = $ua->request ($req);
    if (!$res->is_success) {
        print STDERR "ERROR: Unable to fetch '$url'. Reason: " .
                     $res->status_line . "\n";
        return ("");
    }

    # This will apply any decodings (such as un-gzipping) to the content
    # returned from the server. Once complete, this will return the full
    # text of the page, as a scalar.
    return ($res->decoded_content());
} # End fetchPage().

# Print debugging messages to STDERR, if enabled.
sub DBG {
    my ($msg) = $_[0];

    if ($DEBUG > 0) {
        chomp ($msg);
        print STDERR "DBG: $msg\n";
    }
} # End DBG().

# Print verbose messages to STDOUT, only if enabled.
sub v {
    my ($msg) = $_[0];

    if ($optVerbose > 0) {
        chomp ($msg);
        print "$msg\n";
    }
} # End v().

# Print a usage summary, built from the options accepted by GetOptions()
# above.
sub usage {
    print <<"EOU";
Usage: $0 [options]

Fetch episode metadata for "The Simpsons" from Wikipedia and write it to a
YAML file.

Options:
    --datafile <file>    Output YAML file
                         (default: simpsons_metadb_seasons_1-11.yml)
    --startSeason <n>    First season to fetch (default: 1)
    --endSeason <n>      Last season to fetch (default: 11)
    --verbose, -v        Print progress messages
    --debug, -d          Print debugging messages (repeat for more detail)
    --help, -h, -?       Print this message and exit
EOU
} # End usage().
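# Example invocation (the filename "get_simpsons_metadata.pl" is just a
# stand-in for whatever this script is saved as):
#
#   perl get_simpsons_metadata.pl --startSeason 1 --endSeason 3 \
#       --datafile simpsons_metadb_seasons_1-3.yml --verbose
#
# With no arguments, seasons 1 through 11 are fetched and written to
# simpsons_metadb_seasons_1-11.yml.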