#!/usr/bin/perl
##############################################################################
## Perl script for capturing the essential details from every episode of the
## TV show "Angel", from Wikipedia. This script will load the list of Angel
## episodes from Wikipedia, and then drill down into the specific page for
## each episode. From this page, it will parse out the relevant details for
## each episode (title, writer, episode description, etc.) and write the
## result to a YAML file. This YAML file is useful for re-using these
## details from other programs.
##
## Requires the LWP suite, YAML::XS and the HTML::TableExtract Perl modules,
## and Internet access to Wikipedia.
##
## By Andy Reitz, September 7th, 2009.
## URL: http://redefine.dyndns.org/~andyr/blog/archives/2009/09/angel-in-my-itunes.html
##############################################################################

use strict;

# For fetching URLs.
use LWP::UserAgent;
use HTTP::Cookies;
use URI::Escape;
use LWP::Debug qw(level);

# For writing out the YAML file.
use YAML::XS;

# From: http://search.cpan.org/~msisk/HTML-TableExtract-2.10/lib/HTML/TableExtract.pm
use HTML::TableExtract;

# For argument parsing.
use Getopt::Long;

# For debugging.
use Data::Dumper;

# Global variables.
my ($DEBUG) = 0;
my (@details) = ("Episode Number", "Episode Title", "Directed by",
                 "Written by", "Original Airdate", "Episode ID");
my (%allEpisodesHash) = ();
my (%descriptions) = ();
my ($episodeCounter) = 0;
my ($urlBase) = 'http://en.wikipedia.org/wiki/List_of_Angel_episodes';

# List of relative URLs, parsed from the main 'List of Angel episodes' page.
my (@episodeURLs) = ();

# Set default output filename.
my ($optDatafile) = "angel_metadb_seasons_1-5.yml";
my ($optVerbose) = 0;
my ($optHelp) = 0;

# Parse command-line arguments.
if (!GetOptions ("datafile=s" => \$optDatafile,
                 "verbose|v+" => \$optVerbose,
                 "help|h|?"   => \$optHelp,
                 # Use --debug or -d, specify multiple times to increase
                 # debugosity.
                 "debug|d+"   => \$DEBUG)) {
    &usage();
    exit (3);
}

if ($optHelp) {
    &usage();
    exit (3);
}

if ($DEBUG > 1) {
    # Enable LWP debug messages, but only if debugging is turned to a higher
    # level.
    level ('+debug');
}

my ($ua) = LWP::UserAgent->new;

# Get it!
&DBG ("Fetching URL: '$urlBase'");
my ($page) = &fetchPage($ua, $urlBase);
&DBG ("Page fetched from Wikipedia: $page") if ($DEBUG > 2);

$episodeCounter = 0;
&extractEpisodeURLs($page);

my ($x);
for ($x = 0; $x < scalar(@episodeURLs); $x++) {
    if ($episodeURLs[$x] =~ m|^/wiki/Corrupt|) {
        # Skip the unproduced episode.
        next;
    }

    &extractDetailsFromEpisodePage ($episodeURLs[$x]);
}

&v ("Found $episodeCounter episodes.");

# FIXME: Should add # episodes / season to the hash.

# Write the %allEpisodesHash out to a YAML file on disk.
&dumpYAML($optDatafile);

if ($DEBUG > 2) {
    print "**** Dump of %allEpisodesHash: ****\n";
    print Dumper \%allEpisodesHash;
    print "**** End dump. ****\n";
}

# End script.
exit (0);

# Dump the %allEpisodesHash as YAML, to the specified file.
sub dumpYAML {
    my ($wfn) = $_[0];

    if (!open (YOUT, ">", $wfn)) {
        print "ERROR: Unable to open '$wfn' for writing! Reason: $!\n";
        exit (3);
    }

    &v ("Writing YAML db to file '$wfn'.");
    print YOUT Dump \%allEpisodesHash;

    if (!close (YOUT)) {
        print "ERROR: Unable to close YAML output file. Reason: $!\n";
        exit (3);
    }
} # End dumpYAML().
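##
## A minimal sketch (not part of this script's runtime flow, and purely
## illustrative) of how another program could re-use the YAML db written by
## dumpYAML(). The hash keys match the fields stored in %allEpisodesHash, and
## the filename is just the default from $optDatafile:
##
##   use YAML::XS qw(LoadFile);
##
##   my ($episodes) = LoadFile ("angel_metadb_seasons_1-5.yml");
##   foreach my $epId (sort keys %$episodes) {
##       print $episodes->{$epId}{"Episode Title"}, " (",
##             $episodes->{$epId}{"Original Airdate"}, ")\n";
##   }
##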
##
## Harvests URLs of each Angel episode from the "List of Angel episodes"
## page.
##
## Additionally, episode descriptions are harvested from this page, and
## stored in the global %descriptions hash.
##
sub extractEpisodeURLs {
    # The entire HTML page.
    my ($html_string) = $_[0];

    # Make a new TableExtract object.
    my ($te) = HTML::TableExtract->new( attribs   => { class => "wikitable" },
                                        keep_html => 1 );

    # Extract the table.
    $te->parse($html_string);

    my ($ts) = "";
    my ($prevEpUrl) = "";
    foreach $ts ($te->tables) {
        &DBG ("extractEpisodeURLs(): Table with class='wikitable' found at ",
              join(',', $ts->coords));

        my ($row) = "";
        foreach $row ($ts->rows) {
            # details row
            if ($row->[1]) {
                my ($epUrl) = "";
                if ($row->[2] =~ /a href="\/wiki\/([^"]+)"/) {
                    $epUrl = "/wiki/" . $1;
                    # &DBG ("URL: $epUrl");
                    push @episodeURLs, $epUrl;
                    $prevEpUrl = $epUrl;
                }
            } else {
                # description row
                # &DBG ("saving description for '$prevEpUrl':\n" . $row->[0] . "\n*****");
                $descriptions{$prevEpUrl} = &cleanupHTML($row->[0]);
            }
        }
    }
} # End extractEpisodeURLs().

##
## Given a relative URL to an Angel episode page on Wikipedia, this routine
## will extract the relevant metadata from the episode's info box.
##
## Data is accumulated in a per-episode hash reference ($epHash), which is
## then added to the global %allEpisodesHash, keyed by Episode ID.
##
sub extractDetailsFromEpisodePage {
    my ($page) = $_[0];
    my ($url) = "http://en.wikipedia.org$page";

    &DBG ("Fetching page '$url'");

    # $ua is global.
    my ($episodePage) = &fetchPage($ua, $url);

    # Make a new TableExtract object.
    my ($te) = HTML::TableExtract->new( attribs => { class => "infobox vevent" },
                                        decode  => 0 );

    # Extract the table.
    $te->parse($episodePage);

    my ($ts) = "";
    my ($epHash) = {};
    foreach $ts ($te->tables) {
        &DBG ("Table with class='infobox vevent' found at ", join(',', $ts->coords));

        my ($row) = "";
        my ($firstRow) = 1;
        foreach $row ($ts->rows) {
            &DBG ("infobox row: " . join ('=', @$row));

            if ($firstRow) {
                $epHash->{"Episode Title"} = &cleanupHTML ($row->[0]);
                &v ("Parsed episode: " . $epHash->{"Episode Title"});
                $firstRow = 0;
                next;
            }

            if ($row->[0] =~ /Written by/) {
                $epHash->{"Written by"} = &cleanupHTML ($row->[1]);
                &DBG ("Written by: " . $epHash->{"Written by"});
            } elsif ($row->[0] =~ /Directed by/) {
                $epHash->{"Directed by"} = &cleanupHTML ($row->[1]);
                &DBG ("Directed by: " . $epHash->{"Directed by"});
            } elsif ($row->[0] =~ /Production/) {
                $epHash->{"Episode ID"} = $row->[1];
                &DBG ("Episode ID: " . $row->[1]);
            } elsif ($row->[0] =~ /Original.*airdate/) {
                $epHash->{"Original Airdate"} = $row->[1];
                &DBG ("Original Airdate: " . $row->[1]);
            } elsif ($row->[0] =~ /^Episode/) {
                if ($row->[1] =~ /Season[&][#]160;(\d+).*Episode[&][#]160;(\d+)/ms) {
                    $epHash->{"Season"} = $1;
                    &DBG ("Season: $1");
                    $epHash->{"Episode Number"} = $2;
                    &DBG ("Episode #: $2");
                }
            }
        }
    }

    &DBG ("Setting description for page '$page':\n" . $descriptions{$page});
    $epHash->{"Description"} = $descriptions{$page};

    # Add the little episode hash that we have been building to the
    # master hash.
    if (!$allEpisodesHash{$epHash->{"Episode ID"}}) {
        $allEpisodesHash{$epHash->{"Episode ID"}} = $epHash;
    } else {
        # If we get here, then something is probably wrong with the data.
        print "******\nWARN: episode ID '" . $epHash->{"Episode ID"} .
              "' already exists in hash!\n******\n";
    }

    $episodeCounter++;

    # Reset the single episode hash.
    $epHash = {};

    &DBG (" --- *** ---\n\n");
} # End extractDetailsFromEpisodePage().
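##
## For reference, each entry that extractDetailsFromEpisodePage() adds to
## %allEpisodesHash is keyed by the episode's production code ("Episode ID"),
## and is shaped roughly like this (the values shown here are hypothetical
## placeholders, not real data):
##
##   "<production code>" => {
##       "Episode Title"    => "...",
##       "Directed by"      => "...",
##       "Written by"       => "...",
##       "Original Airdate" => "...",
##       "Episode ID"       => "<production code>",
##       "Season"           => 1,
##       "Episode Number"   => 1,
##       "Description"      => "...",
##   },
##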
##
## A series of regex hacks to clean up the data that we get from Wikipedia.
## I probably should do something a lot more robust here.
##
sub cleanupHTML {
    my ($inp) = $_[0];

    # Convert HTML ampersand entities.
    $inp =~ s/&amp;/&/g;

    # Don't know why Wikipedia is inserting these &#160; entities, but the
    # browser renders them as a space.
    $inp =~ s/&#160;/ /g;

    # Collapse multiple lines down to one.
    $inp =~ s/\n/ /g;

    if ($inp !~ /[(]/) {
        # Text doesn't have any open parens in it, so let's collapse double
        # spaces (and more) down to a comma-separated list.
        $inp =~ s/\s\s+/, /g;
    } else {
        # Collapse spaces, and put commas after closing parens.
        $inp =~ s/\s\s+/ /g;
        $inp =~ s/[)] /), /g;
    }

    # Strip any high-ASCII characters.
    $inp =~ s/[\x80-\xFF]//g;

    # Remove the double quotes around the title. This should
    # preserve double quotes that occur naturally within an episode
    # title.
    $inp =~ s/^"(.*)"$/$1/;

    # Strip any HTML tags.
    $inp =~ s/<[^>]+>//g;

    return ($inp);
} # End cleanupHTML().
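##
## For illustration (hypothetical input, not captured from Wikipedia): a
## multi-line credits cell such as
##
##   "Joss Whedon\n\nDavid Greenwalt"
##
## comes out of cleanupHTML() as
##
##   "Joss Whedon, David Greenwalt"
##
## (the newlines are collapsed to spaces, and because the text contains no
## parentheses, the resulting run of whitespace is collapsed into a
## comma-separated list).
##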
# This is where the magic happens. Using 'HTML::TableExtract', we try to
# find the table on the Wikipedia page that contains all of the metadata
# about the episodes of that season. Once this table has been identified, we
# walk through each row, pick out the bits of metadata that we want to keep,
# perform any necessary massaging, and insert a new entry into the
# %allEpisodesHash hashtable.
#
# If Wikipedia ever changes their markup, this subroutine (and hence the
# whole script) is likely to break.
#
# NOTE: This routine is no longer called from the main flow above; it has
# been superseded by extractEpisodeURLs() and extractDetailsFromEpisodePage().
sub extractEpisodeTableFromPage {
    my ($epHash) = {};

    # The entire HTML page.
    my ($html_string) = $_[0];

    # For tracking what season we're on, which is a useful bit of data to
    # attach to each episode.
    my ($currentSeason) = $_[1];

    # Make a new TableExtract object.
    my ($te) = HTML::TableExtract->new( attribs   => { class => "wikitable" },
                                        keep_html => 1 );

    # Extract the table.
    $te->parse($html_string);

    my ($ts) = "";
    foreach $ts ($te->tables) {
        &DBG ("Table with class='wikitable' found at ", join(',', $ts->coords));

        my ($row) = "";
        foreach $row ($ts->rows) {
            if ($row->[1]) {
                # This is a 6-column row, which contains the descriptive bits
                # about the episode (#, writer, director, etc.)
                # &DBG ("details row: " . join (',', @$row));

                my ($epUrl) = "";
                if ($row->[0] =~ /a href="([^"]+)"/) {
                    $epUrl = $1;
                    &DBG ("URL: $epUrl");
                }
                next; # NOTE: the rest of this branch is currently short-circuited.

                if ($row->[0] =~ /^#/) {
                    # Skip the header row.
                    &DBG ("skip header: " . $row->[0]);
                    next;
                }

                my ($e) = ""; # row element
                my ($c) = 0;  # column counter
                foreach $e (@$row) {
                    # Strip any high-ASCII characters.
                    $e =~ s/[\x80-\xFF]//g;

                    # Do some data fixin'.
                    if ($details[$c] eq "Episode Title") {
                        # Condense titles that span multiple lines down to one line.
                        $e =~ s/\n\n/, /g;

                        # Remove the double quotes around the title. This should
                        # preserve double quotes that occur naturally within an
                        # episode title.
                        $e =~ s/^"(.*)"$/$1/;
                    } elsif ($details[$c] eq "Episode Number") {
                        # In Wikipedia, this is '# - #', where the '-' is some
                        # unicode bullshit. So, we just skip those characters
                        # (it's 3 bytes).
                        #
                        # The second number is the episode # for the season,
                        # which is what we want.
                        $e =~ s/^(\d+).*?(\d+)$/$2/;
                    } elsif ($details[$c] eq "Written by") {
                        # For some episodes (notably Halloween episodes), the
                        # writers will span multiple lines. Let's fix this up
                        # so that it looks nice.
                        $e =~ s/\n\n/, /g;
                    }

                    $epHash->{$details[$c]} = $e;
                    $c++;
                }
            } else {
                # This is a description row - colspan is 6, and it contains the
                # episode's description.
                # &DBG ("episode description: " . $row->[0]);
                next; # NOTE: the description handling below is currently short-circuited.

                # Data fixin'. Remove the '[28]?' footnotes from the description.
                # The '?' is actually some high-ASCII byte.
                $row->[0] =~ s/\[\d+\].$//m;

                &DBG ("description row: " . join (',', $row->[0]));

                $epHash->{"Description"} = $row->[0];
                $epHash->{"Season"} = $currentSeason;

                # Add the little episode hash that we have been building to the
                # master hash.
                if (!$allEpisodesHash{$epHash->{"Episode ID"}}) {
                    $allEpisodesHash{$epHash->{"Episode ID"}} = $epHash;
                } else {
                    # If we get here, then something is probably wrong with the data.
                    print "******\nWARN: episode ID '" . $epHash->{"Episode ID"} .
                          "' already exists in hash!\n******\n";
                }

                $episodeCounter++;

                # Reset the single episode hash.
                $epHash = {};
            } # row type
        } # foreach row
    } # foreach table
} # End extractEpisodeTableFromPage().

# Use LWP to fetch a page from Wikipedia. Takes in a UserAgent object, and a
# URL. Returns the text of the page as a scalar.
sub fetchPage {
    my ($ua) = $_[0];
    my ($url) = $_[1];

    my ($req) = HTTP::Request->new (GET => $url);

    # Don't give me GZIP crapola. Wikipedia seems to ignore this in some
    # instances (particularly for Season 8).
    $req->header ('Accept-Encoding' => 'identity;q=1.0, gzip;q=0, compress;q=0');
    $req->header ('TE' => 'gzip;q=0, deflate;q=0');

    my ($res) = $ua->request ($req);
    if (!$res->is_success) {
        return ("", "ERROR: " . $res->status_line);
    }

    # This will apply any decodings (such as un-gzipping) to the content
    # returned from the server. Once complete, this will return the full text
    # of the page, as a scalar.
    return ($res->decoded_content());
} # End fetchPage().

# Print debugging messages to STDERR, if enabled. Joins all arguments, so
# that multi-argument calls (e.g. those passing table coordinates) are
# printed in full.
sub DBG {
    my ($msg) = join ('', @_);

    if ($DEBUG > 0) {
        chomp ($msg);
        print STDERR "DBG: $msg\n";
    }
} # End DBG().

# Print verbose messages to STDOUT, only if enabled.
sub v {
    my ($msg) = $_[0];

    if ($optVerbose > 0) {
        chomp ($msg);
        print "$msg\n";
    }
} # End v().

# Print a usage/help message for the command-line options parsed by
# GetOptions() above. (The original help text was truncated; this wording is
# reconstructed from the option spec.)
sub usage {
    print <<"EOF";
Usage: $0 [--datafile <file>] [--verbose] [--debug] [--help]

    --datafile <file>   Write the YAML episode db to <file>
                        (default: angel_metadb_seasons_1-5.yml).
    --verbose, -v       Print verbose progress messages to STDOUT.
    --debug, -d         Print debugging messages to STDERR; repeat for more
                        detail (e.g. -d -d also enables LWP debugging).
    --help, -h, -?      Print this message and exit.
EOF
} # End usage().
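##
## Example invocation (the script filename here is hypothetical; use whatever
## name this file is saved under):
##
##   perl angel_episodes.pl --verbose --datafile angel_metadb_seasons_1-5.yml
##
## With no arguments, the script still fetches everything from Wikipedia and
## writes the YAML db to the default file, angel_metadb_seasons_1-5.yml.
##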