#!/usr/bin/perl
##############################################################################
## Perl script for capturing the essential details from every episode of the
## TV show "Angel", from Wikipedia. This script will load the list of Angel
## episodes from Wikipedia, and then drill down into the specific page for
## each episode. From this page, it will parse out the relevant details for
## each episode (title, writer, episode description, etc.) and write the
## result to a YAML file. This YAML file is useful for re-using these
## details from other programs.
##
## Requires the LWP suite, YAML::XS and the HTML::TableExtract Perl modules,
## and Internet access to Wikipedia.
##
## By Andy Reitz, September 7th, 2009.
## URL: http://redefine.dyndns.org/~andyr/blog/archives/2009/09/angel-in-my-itunes.html
##############################################################################

use strict;

# For fetching URLs.
use LWP::UserAgent;
use HTTP::Cookies;
use URI::Escape;
use LWP::Debug qw(level);

# For writing out the YAML file.
use YAML::XS;

# From: http://search.cpan.org/~msisk/HTML-TableExtract-2.10/lib/HTML/TableExtract.pm
use HTML::TableExtract;

# For argument parsing.
use Getopt::Long;

# For debugging.
use Data::Dumper;

# Global variables.
my ($DEBUG) = 0;
my (@details) = ("Episode Number", "Episode Title", "Directed by",
                 "Written by", "Original Airdate", "Episode ID");
my (%allEpisodesHash) = ();
my (%descriptions) = ();
my ($episodeCounter) = 0;
my ($urlBase) = 'http://en.wikipedia.org/wiki/List_of_Angel_episodes';

# List of relative URLs, parsed from the main 'List of Angel episodes' page.
my (@episodeURLs) = ();

# Set default output filename.
my ($optDatafile) = "angel_metadb_seasons_1-5.yml";
my ($optVerbose) = 0;
my ($optHelp) = 0;

# Parse command-line arguments.
if (!GetOptions ("datafile=s" => \$optDatafile,
                 "verbose|v+" => \$optVerbose,
                 "help|h|?"   => \$optHelp,
                 # Use --debug or -d, specify multiple times to increase
                 # debugosity.
                 "debug|d+"   => \$DEBUG)) {
    &usage();
    exit (3);
}

if ($optHelp) {
    &usage();
    exit (3);
}

if ($DEBUG > 1) {
    # Enable LWP debug messages, but only if debugging is turned to a higher
    # level.
    level ('+debug');
}

my ($ua) = LWP::UserAgent->new;

# Get it!
&DBG ("Fetching URL: '$urlBase'");
my ($page) = &fetchPage($ua, $urlBase);
&DBG ("Page fetched from Wikipedia: $page") if ($DEBUG > 2);

$episodeCounter = 0;
&extractEpisodeURLs($page);

my ($x);
for ($x = 0; $x < scalar(@episodeURLs); $x++) {
    if ($episodeURLs[$x] =~ m|^/wiki/Corrupt|) {
        # Skip the unproduced episode.
        next;
    }

    &extractDetailsFromEpisodePage ($episodeURLs[$x]);
}

&v ("Found $episodeCounter episodes.");

# FIXME: Should add # episodes / season to the hash.

# Write the %allEpisodesHash out to a YAML file on disk.
&dumpYAML($optDatafile);

if ($DEBUG > 2) {
    print "**** Dump of %allEpisodesHash: ****\n";
    print Dumper \%allEpisodesHash;
    print "**** End dump. ****\n";
}

# End script.
exit (0);

# Dump the %allEpisodesHash as YAML, to the specified file.
sub dumpYAML {
    my ($wfn) = $_[0];

    if (!open (YOUT, ">", $wfn)) {
        print "ERROR: Unable to open '$wfn' for writing! Reason: $!\n";
        exit (3);
    }

    &v ("Writing YAML db to file '$wfn'.");
    print YOUT Dump \%allEpisodesHash;

    if (!close (YOUT)) {
        print "ERROR: Unable to close YAML output file. Reason: $!\n";
        exit (3);
    }
} # End dumpYAML().
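##
## A minimal sketch (not part of this script's runtime flow, and purely
## illustrative) of how another program could re-use the YAML db written by
## dumpYAML(). The hash keys match the fields stored in %allEpisodesHash, and
## the filename is just the default from $optDatafile:
##
##   use YAML::XS qw(LoadFile);
##
##   my ($episodes) = LoadFile ("angel_metadb_seasons_1-5.yml");
##   foreach my $epId (sort keys %$episodes) {
##       print $episodes->{$epId}{"Episode Title"}, " (",
##             $episodes->{$epId}{"Original Airdate"}, ")\n";
##   }
##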
##
## Harvests URLs of each Angel episode from the "List of Angel episodes"
## page.
##
## Additionally, episode descriptions are harvested from this page, and
## stored in the global %descriptions hash.
##
sub extractEpisodeURLs {
    # The entire HTML page.
    my ($html_string) = $_[0];

    # Make a new TableExtract object.
    my ($te) = HTML::TableExtract->new( attribs   => { class => "wikitable" },
                                        keep_html => 1 );

    # Extract the table.
    $te->parse($html_string);

    my ($ts) = "";
    my ($prevEpUrl) = "";
    foreach $ts ($te->tables) {
        &DBG ("extractEpisodeURLs(): Table with class='wikitable' found at ",
              join(',', $ts->coords));

        my ($row) = "";
        foreach $row ($ts->rows) {
            # details row
            if ($row->[1]) {
                my ($epUrl) = "";
                if ($row->[2] =~ /a href="\/wiki\/([^"]+)"/) {
                    $epUrl = "/wiki/" . $1;
                    # &DBG ("URL: $epUrl");
                    push @episodeURLs, $epUrl;
                    $prevEpUrl = $epUrl;
                }
            } else {
                # description row
                # &DBG ("saving description for '$prevEpUrl':\n" . $row->[0] . "\n*****");
                $descriptions{$prevEpUrl} = &cleanupHTML($row->[0]);
            }
        }
    }
} # End extractEpisodeURLs().

##
## Given a relative URL to an Angel episode page on Wikipedia, this routine
## will extract the relevant metadata from the episode's info box.
##
## Data is accumulated in a per-episode hash reference ($epHash), which is
## then added to the global %allEpisodesHash, keyed by Episode ID.
##
sub extractDetailsFromEpisodePage {
    my ($page) = $_[0];
    my ($url) = "http://en.wikipedia.org$page";

    &DBG ("Fetching page '$url'");

    # $ua is global.
    my ($episodePage) = &fetchPage($ua, $url);

    # Make a new TableExtract object.
    my ($te) = HTML::TableExtract->new( attribs => { class => "infobox vevent" },
                                        decode  => 0 );

    # Extract the table.
    $te->parse($episodePage);

    my ($ts) = "";
    my ($epHash) = {};
    foreach $ts ($te->tables) {
        &DBG ("Table with class='infobox vevent' found at ", join(',', $ts->coords));

        my ($row) = "";
        my ($firstRow) = 1;
        foreach $row ($ts->rows) {
            &DBG ("infobox row: " . join ('=', @$row));

            if ($firstRow) {
                $epHash->{"Episode Title"} = &cleanupHTML ($row->[0]);
                &v ("Parsed episode: " . $epHash->{"Episode Title"});
                $firstRow = 0;
                next;
            }

            if ($row->[0] =~ /Written by/) {
                $epHash->{"Written by"} = &cleanupHTML ($row->[1]);
                &DBG ("Written by: " . $epHash->{"Written by"});
            } elsif ($row->[0] =~ /Directed by/) {
                $epHash->{"Directed by"} = &cleanupHTML ($row->[1]);
                &DBG ("Directed by: " . $epHash->{"Directed by"});
            } elsif ($row->[0] =~ /Production/) {
                $epHash->{"Episode ID"} = $row->[1];
                &DBG ("Episode ID: " . $row->[1]);
            } elsif ($row->[0] =~ /Original.*airdate/) {
                $epHash->{"Original Airdate"} = $row->[1];
                &DBG ("Original Airdate: " . $row->[1]);
            } elsif ($row->[0] =~ /^Episode/) {
                if ($row->[1] =~ /Season[&][#]160;(\d+).*Episode[&][#]160;(\d+)/ms) {
                    $epHash->{"Season"} = $1;
                    &DBG ("Season: $1");
                    $epHash->{"Episode Number"} = $2;
                    &DBG ("Episode #: $2");
                }
            }
        }
    }

    &DBG ("Setting description for page '$page':\n" . $descriptions{$page});
    $epHash->{"Description"} = $descriptions{$page};

    # Add the little episode hash that we have been building to the
    # master hash.
    if (!$allEpisodesHash{$epHash->{"Episode ID"}}) {
        $allEpisodesHash{$epHash->{"Episode ID"}} = $epHash;
    } else {
        # If we get here, then something is probably wrong with the data.
        print "******\nWARN: episode ID '" . $epHash->{"Episode ID"} .
              "' already exists in hash!\n******\n";
    }

    $episodeCounter++;

    # Reset the single episode hash.
    $epHash = {};

    &DBG (" --- *** ---\n\n");
} # End extractDetailsFromEpisodePage().
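##
## For reference, each entry that extractDetailsFromEpisodePage() adds to
## %allEpisodesHash is keyed by the episode's production code ("Episode ID"),
## and is shaped roughly like this (the values shown here are hypothetical
## placeholders, not real data):
##
##   "<production code>" => {
##       "Episode Title"    => "...",
##       "Directed by"      => "...",
##       "Written by"       => "...",
##       "Original Airdate" => "...",
##       "Episode ID"       => "<production code>",
##       "Season"           => 1,
##       "Episode Number"   => 1,
##       "Description"      => "...",
##   },
##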
##
## A series of regex hacks to clean up the data that we get from Wikipedia.
## I probably should do something a lot more robust here.
##
sub cleanupHTML {
    my ($inp) = $_[0];

    # Convert HTML ampersand entities.
    $inp =~ s/&amp;/&/g;

    # Don't know why Wikipedia is inserting these &#160; entities, but the
    # browser renders them as a space.
    $inp =~ s/&#160;/ /g;

    # Collapse multiple lines down to one.
    $inp =~ s/\n/ /g;

    if ($inp !~ /[(]/) {
        # Text doesn't have any open parens in it, so let's collapse double
        # spaces (and more) down to a comma-separated list.
        $inp =~ s/\s\s+/, /g;
    } else {
        # Collapse spaces, and put commas after closing parens.
        $inp =~ s/\s\s+/ /g;
        $inp =~ s/[)] /), /g;
    }

    # Strip any high-ASCII characters.
    $inp =~ s/[\x80-\xFF]//g;

    # Remove the double quotes around the title. This should
    # preserve double quotes that occur naturally within an episode
    # title.
    $inp =~ s/^"(.*)"$/$1/;

    # Strip any HTML tags.
    $inp =~ s/<[^>]+>//g;

    return ($inp);
} # End cleanupHTML().
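##
## For illustration (hypothetical input, not captured from Wikipedia): a
## multi-line credits cell such as
##
##   "Joss Whedon\n\nDavid Greenwalt"
##
## comes out of cleanupHTML() as
##
##   "Joss Whedon, David Greenwalt"
##
## (the newlines are collapsed to spaces, and because the text contains no
## parentheses, the resulting run of whitespace is collapsed into a
## comma-separated list).
##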
# This is where the magic happens. Using 'HTML::TableExtract', we try to
# find the table on the Wikipedia page that contains all of the metadata
# about the episodes of that season. Once this table has been identified, we
# walk through each row, pick out the bits of metadata that we want to keep,
# perform any necessary massaging, and insert a new entry into the
# %allEpisodesHash hashtable.
#
# If Wikipedia ever changes their markup, this subroutine (and hence the
# whole script) is likely to break.
#
# NOTE: This routine is no longer called from the main flow above; it has
# been superseded by extractEpisodeURLs() and extractDetailsFromEpisodePage().
sub extractEpisodeTableFromPage {
    my ($epHash) = {};

    # The entire HTML page.
    my ($html_string) = $_[0];

    # For tracking what season we're on, which is a useful bit of data to
    # attach to each episode.
    my ($currentSeason) = $_[1];

    # Make a new TableExtract object.
    my ($te) = HTML::TableExtract->new( attribs   => { class => "wikitable" },
                                        keep_html => 1 );

    # Extract the table.
    $te->parse($html_string);

    my ($ts) = "";
    foreach $ts ($te->tables) {
        &DBG ("Table with class='wikitable' found at ", join(',', $ts->coords));

        my ($row) = "";
        foreach $row ($ts->rows) {
            if ($row->[1]) {
                # This is a 6-column row, which contains the descriptive bits
                # about the episode (#, writer, director, etc.)
                # &DBG ("details row: " . join (',', @$row));

                my ($epUrl) = "";
                if ($row->[0] =~ /a href="([^"]+)"/) {
                    $epUrl = $1;
                    &DBG ("URL: $epUrl");
                }
                next; # NOTE: the rest of this branch is currently short-circuited.

                if ($row->[0] =~ /^#/) {
                    # Skip the header row.
                    &DBG ("skip header: " . $row->[0]);
                    next;
                }

                my ($e) = ""; # row element
                my ($c) = 0;  # column counter
                foreach $e (@$row) {
                    # Strip any high-ASCII characters.
                    $e =~ s/[\x80-\xFF]//g;

                    # Do some data fixin'.
                    if ($details[$c] eq "Episode Title") {
                        # Condense titles that span multiple lines down to one line.
                        $e =~ s/\n\n/, /g;

                        # Remove the double quotes around the title. This should
                        # preserve double quotes that occur naturally within an
                        # episode title.
                        $e =~ s/^"(.*)"$/$1/;
                    } elsif ($details[$c] eq "Episode Number") {
                        # In Wikipedia, this is '# - #', where the '-' is some
                        # unicode bullshit. So, we just skip those characters
                        # (it's 3 bytes).
                        #
                        # The second number is the episode # for the season,
                        # which is what we want.
                        $e =~ s/^(\d+).*?(\d+)$/$2/;
                    } elsif ($details[$c] eq "Written by") {
                        # For some episodes (notably Halloween episodes), the
                        # writers will span multiple lines. Let's fix this up
                        # so that it looks nice.
                        $e =~ s/\n\n/, /g;
                    }

                    $epHash->{$details[$c]} = $e;
                    $c++;
                }
            } else {
                # This is a description row - colspan is 6, and it contains the
                # episode's description.
                # &DBG ("episode description: " . $row->[0]);
                next; # NOTE: the description handling below is currently short-circuited.

                # Data fixin'. Remove the '[28]?' footnotes from the description.
                # The '?' is actually some high-ASCII byte.
                $row->[0] =~ s/\[\d+\].$//m;

                &DBG ("description row: " . join (',', $row->[0]));

                $epHash->{"Description"} = $row->[0];
                $epHash->{"Season"} = $currentSeason;

                # Add the little episode hash that we have been building to the
                # master hash.
                if (!$allEpisodesHash{$epHash->{"Episode ID"}}) {
                    $allEpisodesHash{$epHash->{"Episode ID"}} = $epHash;
                } else {
                    # If we get here, then something is probably wrong with the data.
                    print "******\nWARN: episode ID '" . $epHash->{"Episode ID"} .
                          "' already exists in hash!\n******\n";
                }

                $episodeCounter++;

                # Reset the single episode hash.
                $epHash = {};
            } # row type
        } # foreach row
    } # foreach table
} # End extractEpisodeTableFromPage().

# Use LWP to fetch a page from Wikipedia. Takes in a UserAgent object, and a
# URL. Returns the text of the page as a scalar.
sub fetchPage {
    my ($ua) = $_[0];
    my ($url) = $_[1];

    my ($req) = HTTP::Request->new (GET => $url);

    # Don't give me GZIP crapola. Wikipedia seems to ignore this in some
    # instances (particularly for Season 8).
    $req->header ('Accept-Encoding' => 'identity;q=1.0, gzip;q=0, compress;q=0');
    $req->header ('TE' => 'gzip;q=0, deflate;q=0');

    my ($res) = $ua->request ($req);
    if (!$res->is_success) {
        return ("", "ERROR: " . $res->status_line);
    }

    # This will apply any decodings (such as un-gzipping) to the content
    # returned from the server. Once complete, this will return the full text
    # of the page, as a scalar.
    return ($res->decoded_content());
} # End fetchPage().

# Print debugging messages to STDERR, if enabled. Joins all arguments, so
# that multi-argument calls (e.g. those passing table coordinates) are
# printed in full.
sub DBG {
    my ($msg) = join ('', @_);

    if ($DEBUG > 0) {
        chomp ($msg);
        print STDERR "DBG: $msg\n";
    }
} # End DBG().

# Print verbose messages to STDOUT, only if enabled.
sub v {
    my ($msg) = $_[0];

    if ($optVerbose > 0) {
        chomp ($msg);
        print "$msg\n";
    }
} # End v().

# Print a usage/help message for the command-line options parsed by
# GetOptions() above. (The original help text was truncated; this wording is
# reconstructed from the option spec.)
sub usage {
    print <<"EOF";
Usage: $0 [--datafile <file>] [--verbose] [--debug] [--help]

    --datafile <file>   Write the YAML episode db to <file>
                        (default: angel_metadb_seasons_1-5.yml).
    --verbose, -v       Print verbose progress messages to STDOUT.
    --debug, -d         Print debugging messages to STDERR; repeat for more
                        detail (e.g. -d -d also enables LWP debugging).
    --help, -h, -?      Print this message and exit.
EOF
} # End usage().
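##
## Example invocation (the script filename here is hypothetical; use whatever
## name this file is saved under):
##
##   perl angel_episodes.pl --verbose --datafile angel_metadb_seasons_1-5.yml
##
## With no arguments, the script still fetches everything from Wikipedia and
## writes the YAML db to the default file, angel_metadb_seasons_1-5.yml.
##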