#!/usr/bin/perl
##############################################################################
## Perl script for capturing the essential details from every episode of
## specific seasons of "The Simpsons", from Wikipedia. This script will load
## each season page from Wikipedia, parse out the relevant details for each
## episode (title, writer, episode description, etc.), and write the result
## to a YAML file. This YAML file is useful for re-using these details from
## other programs.
##
## Requires the LWP suite, the YAML::XS and HTML::TableExtract Perl modules,
## and Internet access to Wikipedia.
##
## By Andy Reitz, January 17th, 2009.
## URL: http://redefine.dyndns.org/~andyr/blog/archives/2009/02/the-simpsons-on-apple-tv.html
##############################################################################

use strict;

# For fetching URLs.
use LWP::UserAgent;
use HTTP::Cookies;
use URI::Escape;
use LWP::Debug qw(level);

# For writing out the YAML file.
use YAML::XS;

# From: http://search.cpan.org/~msisk/HTML-TableExtract-2.10/lib/HTML/TableExtract.pm
use HTML::TableExtract;

# For argument parsing.
use Getopt::Long;

# Global variables.
my ($DEBUG) = 0;
my (@details) = ("Episode Number", "Episode Title", "Directed by",
                 "Written by", "Original Airdate", "Episode ID");
my (%allEpisodesHash) = ();
my ($episodeCounter) = 0;
my ($urlBase) = 'http://en.wikipedia.org/wiki/The_Simpsons_(season_%d)';

# Set default seasons to grab.
my ($optStartSeason) = 1;
my ($optEndSeason) = 11;

# Set default output filename.
my ($optDatafile) = sprintf ("simpsons_metadb_seasons_%d-%d.yml",
                             $optStartSeason, $optEndSeason);
my ($optVerbose) = 0;
my ($optHelp) = 0;

# Parse command-line arguments.
if (!GetOptions ("datafile=s"    => \$optDatafile,
                 "startSeason=i" => \$optStartSeason,
                 "endSeason=i"   => \$optEndSeason,
                 "verbose|v+"    => \$optVerbose,
                 "help|h|?"      => \$optHelp,
                 # Use --debug or -d, specify multiple times to increase
                 # debugosity.
                 "debug|d+"      => \$DEBUG)) {
    &usage();
    exit (3);
}

if ($optHelp) {
    &usage();
    exit (3);
}

if ($DEBUG > 1) {
    # Enable LWP debug messages, but only if debugging is turned up to a
    # higher level.
    level ('+debug');
}

my ($ua) = LWP::UserAgent->new;
my ($url) = "";
my ($s);
for ($s = $optStartSeason; $s <= $optEndSeason; $s++) {
    # Write the current season # into the URL.
    $url = sprintf ($urlBase, $s);

    # Get it!
    my ($page) = &fetchPage($ua, $url);
    &DBG ("Page fetched from Wikipedia: $page") if ($DEBUG > 2);

    $episodeCounter = 0;

    # Has side-effects on the global %allEpisodesHash variable.
    &extractEpisodeTableFromPage($page, $s);

    &v ("Found $episodeCounter episodes for season $s.");

    # FIXME: Should add # episodes / season to the hash.
}

# Write the %allEpisodesHash out to a YAML file on disk.
&dumpYAML($optDatafile);

if ($DEBUG > 2) {
    use Data::Dumper;
    print "**** Dump of %allEpisodesHash: ****\n";
    print Dumper \%allEpisodesHash;
    print "**** End dump. ****\n";
}

# End script.
exit (0);

# Dump the %allEpisodesHash as YAML, to the specified file.
sub dumpYAML {
    my ($wfn) = $_[0];

    if (!open (YOUT, ">", $wfn)) {
        print "ERROR: Unable to open '$wfn' for writing! Reason: $!\n";
        exit (3);
    }

    &v ("Writing YAML db to file '$wfn'.");
    print YOUT Dump \%allEpisodesHash;

    if (!close (YOUT)) {
        print "ERROR: Unable to close yaml output file. Reason: $!\n";
        exit (3);
    }
} # End dumpYAML().
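# The YAML db written by dumpYAML() above is intended to be re-used from
# other programs (see the header comment). A rough consumer sketch, shown
# here only as illustration -- the filename is this script's default output,
# and the keys come from @details plus "Description" and "Season":
#
#   use YAML::XS qw(LoadFile);
#
#   # Top-level keys are Wikipedia "Episode ID" values; each value is a
#   # hash of that episode's metadata.
#   my $episodes = LoadFile("simpsons_metadb_seasons_1-11.yml");
#   foreach my $id (sort keys %$episodes) {
#       printf ("S%02d: %s\n", $episodes->{$id}{"Season"},
#               $episodes->{$id}{"Episode Title"});
#   }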
# This is where the magic happens. Using 'HTML::TableExtract', we try to
# find the table on the Wikipedia page that contains all of the metadata
# about the episodes of that season. Once this table has been identified, we
# walk through each row, pick out the bits of metadata that we want to keep,
# perform any necessary massaging, and insert a new entry into the
# %allEpisodesHash hashtable.
#
# If Wikipedia ever changes their markup, this subroutine (and hence the
# whole script) is likely to break.
sub extractEpisodeTableFromPage {
    my ($epHash) = {};

    # The entire HTML page.
    my ($html_string) = $_[0];

    # For tracking what season we're on, which is a useful bit of data to
    # attach to each episode.
    my ($currentSeason) = $_[1];

    # Make a new TableExtract object.
    my ($te) = HTML::TableExtract->new( attribs => { class => "wikitable" } );

    # Extract the table.
    $te->parse($html_string);

    my ($ts) = "";
    foreach $ts ($te->tables) {
        &DBG ("Table with class='wikitable' found at " . join(',', $ts->coords));

        my ($row) = "";
        foreach $row ($ts->rows) {
            if ($row->[1]) {
                # This is a 6-column row, which contains the descriptive bits
                # about the episode (#, writer, director, etc.)
                &DBG ("details row: " . join (',', @$row));

                if ($row->[0] =~ /^#/) {
                    # Skip the header row.
                    &DBG ("skip header: " . $row->[0]);
                    next;
                }

                my ($e) = "";   # row element
                my ($c) = 0;    # column counter
                foreach $e (@$row) {
                    # Strip any high-ascii characters.
                    $e =~ s/[\x80-\xFF]//g;

                    # Do some data fixin'.
                    if ($details[$c] eq "Episode Title") {
                        # Condense titles that span multiple lines down to
                        # one line.
                        $e =~ s/\n\n/, /g;

                        # Remove the double quotes around the title. This
                        # should preserve double quotes that occur naturally
                        # within an episode title.
                        $e =~ s/^"(.*)"$/$1/;
                    } elsif ($details[$c] eq "Episode Number") {
                        # In Wikipedia, this is '# - #', where the '-' is
                        # some unicode bullshit. So, we just skip those
                        # characters (it's 3 bytes).
                        #
                        # The second number is the episode # for the season,
                        # which is what we want.
                        $e =~ s/^(\d+).*?(\d+)$/$2/;
                    } elsif ($details[$c] eq "Written by") {
                        # For some episodes (notably halloween episodes), the
                        # writers will span multiple lines. Let's fix this up
                        # so that it looks nice.
                        $e =~ s/\n\n/, /g;
                    }

                    $epHash->{$details[$c]} = $e;
                    $c++;
                }
            } else {
                # This is a description row - colspan is 6, and it contains
                # the episode's description.

                # Data fixin'. Remove the '[28]?' footnotes from the
                # description. The '?' is actually some high-ascii byte.
                $row->[0] =~ s/\[\d+\].$//m;

                &DBG ("description row: " . $row->[0]);

                $epHash->{"Description"} = $row->[0];
                $epHash->{"Season"} = $currentSeason;

                # Add the little episode hash that we have been building to
                # the master hash.
                if (!$allEpisodesHash{$epHash->{"Episode ID"}}) {
                    $allEpisodesHash{$epHash->{"Episode ID"}} = $epHash;
                } else {
                    # If we get here, then something is probably wrong with
                    # the data.
                    print "******\nWARN: episode ID '" . $epHash->{"Episode ID"} .
                          "' already exists in hash!\n******\n";
                }

                $episodeCounter++;

                # Reset the single episode hash.
                $epHash = {};
            } # row type
        } # foreach row
    } # foreach table
} # End extractEpisodeTableFromPage().
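# An illustrative, self-contained sketch (not part of the script's flow) of
# why the $row->[1] test above distinguishes the two row types: with
# HTML::TableExtract's default gridmap behaviour, a row whose single cell
# spans all six columns comes back with the remaining column positions
# empty, so $row->[1] is false. The markup below is made up, not real
# Wikipedia markup.
#
#   use HTML::TableExtract;
#
#   my $snippet = <<'HTML';
#   <table class="wikitable">
#     <tr><td>1 - 1</td><td>Title</td><td>Director</td><td>Writer</td>
#         <td>Airdate</td><td>ProdCode</td></tr>
#     <tr><td colspan="6">Episode description goes here.</td></tr>
#   </table>
#   HTML
#
#   my $te = HTML::TableExtract->new( attribs => { class => "wikitable" } );
#   $te->parse($snippet);
#   foreach my $row (($te->tables)[0]->rows) {
#       print $row->[1] ? "details row\n" : "description row\n";
#   }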
# Use LWP to fetch a page from Wikipedia. Takes in a UserAgent object, and a
# URL. Returns the text of the page as a scalar; on failure, prints an error
# to STDERR and returns an empty string.
sub fetchPage {
    my ($ua) = $_[0];
    my ($url) = $_[1];

    my ($req) = HTTP::Request->new (GET => $url);

    # Don't give me GZIP crapola. Wikipedia seems to ignore this in some
    # instances (particularly for Season 8).
    $req->header ('Accept-Encoding' => 'identity;q=1.0, gzip;q=0, compress;q=0');
    $req->header ('TE' => 'gzip;q=0, deflate;q=0');

    my ($res) = $ua->request ($req);
    if (!$res->is_success) {
        print STDERR "ERROR: Unable to fetch '$url'. Reason: " .
                     $res->status_line . "\n";
        return ("");
    }

    # This will apply any decodings (such as un-gzipping) to the content
    # returned from the server. Once complete, this will return the full
    # text of the page, as a scalar.
    return ($res->decoded_content());
} # End fetchPage().

# Print debugging messages to STDERR, if enabled.
sub DBG {
    my ($msg) = $_[0];

    if ($DEBUG > 0) {
        chomp ($msg);
        print STDERR "DBG: $msg\n";
    }
} # End DBG().

# Print verbose messages to STDOUT, only if enabled.
sub v {
    my ($msg) = $_[0];

    if ($optVerbose > 0) {
        chomp ($msg);
        print "$msg\n";
    }
} # End v().

# Print a usage summary, built from the options accepted by GetOptions()
# above.
sub usage {
    print <<"EOU";
Usage: $0 [options]

Fetch episode metadata for "The Simpsons" from Wikipedia and write it to a
YAML file.

Options:
    --datafile <file>    Output YAML file
                         (default: simpsons_metadb_seasons_1-11.yml)
    --startSeason <n>    First season to fetch (default: 1)
    --endSeason <n>      Last season to fetch (default: 11)
    --verbose, -v        Print progress messages
    --debug, -d          Print debugging messages (repeat for more detail)
    --help, -h, -?       Print this message and exit
EOU
} # End usage().
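# Example invocation (the filename "get_simpsons_metadata.pl" is just a
# stand-in for whatever this script is saved as):
#
#   perl get_simpsons_metadata.pl --startSeason 1 --endSeason 3 \
#       --datafile simpsons_metadb_seasons_1-3.yml --verbose
#
# With no arguments, seasons 1 through 11 are fetched and written to
# simpsons_metadb_seasons_1-11.yml.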