#!/usr/bin/perl

# geoupload -- upload files to geocities.com

# Copyright 2007, 2008, 2009 Kevin Ryde

# geoupload is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3, or (at your option) any later
# version.
#
# geoupload is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You can get a copy of the GNU General Public License online at
# <http://www.gnu.org/licenses/>.


use strict;
use warnings;
use 5.006; # 3-arg open
use Cwd;
use File::Basename;
use File::Spec;
use File::Path;
use Getopt::Long;
use List::Util qw(min max);

# LWP 5.818 for dump(), then 5.819 for bug fixes
use LWP 5.819;
use LWP::UserAgent;
use LWP::Debug;
use HTTP::Cookies;
use HTTP::Date;
use HTML::Form;

use HTML::TableExtract;
use Perl6::Slurp;
use WWW::Mechanize;


our $VERSION = 9;


#------------------------------------------------------------------------------
# misc

sub writefile {
  my ($filename, $contents) = @_;
  my $out;
  (open $out, '>', $filename
   and print $out $contents
   and close $out or die)
    or die "Cannot write $filename: $!";
}


#------------------------------------------------------------------------------
# startup

our $verbose = 0;
our $username;
our $password;
our $basedir;
our $throttle = 0;
my $option_action = 'upload';

GetOptions('verbose:1'  => \$verbose,
           'username=s' => \$username,
           'password=s' => \$password,
           'basedir=s'  => \$basedir,
           'throttle'   => \$throttle,
           'ls'         => sub { $option_action = 'ls' },
           'stats'      => sub { $option_action = 'stats' },
           'rm'         => sub { $option_action = 'rm' },
           'verify'     => sub { $option_action = 'verify' },
           'version'    => \&do_version,
           'help|?' => sub {
  print "geoupload [--options] filename...\n";
  print "   --help       print this help\n";
  print "   --ls         print listing of files in given directory (default toplevel)\n";
  print "   --rm         remove the given files from the server\n";
  print "   --stats      print hits statistics about the given files\n";
  print "   --verbose    describe what's done\n";
  print "   --verbose=2  describe in detail what's done\n";
  print "   --verify     download the given files to check against local copies\n";
  print "   --version    print program version number\n";
  print "Each filename given is uploaded to your web site.\n";
  print "Create a file ~/.geoupload to set your username, password and basedir, like\n";
  print "   \$username = 'my_name';\n";
  print "   \$password = 'my_passwd';\n";
  print "   \$basedir  = '/my/local/copy';\n";
  print "Then for instance\n";
  print "   geoupload /my/local/copy/index.html\n";
  exit 0;
}) or exit 1;

sub do_version {
  print "geoupload version $main::VERSION\n";
  if ($verbose) {
    print "and using\n";
    print "  WWW::Mechanize      ",WWW::Mechanize->VERSION,"\n";
    print "  LWP::UserAgent      ",LWP::UserAgent->VERSION,"\n";
    print "  HTML::TableExtract  ",HTML::TableExtract->VERSION,"\n";
  }
  exit 0;
}

if ($verbose) {
  open STDERR, '>&STDOUT'
    or die "Oops, cannot dup STDERR onto STDOUT";
}

{
  my $home = $ENV{'HOME'};
  my $filename = File::Spec->catfile ($home, '.geoupload');
  if (! defined (do $filename)) {
    if ($@) {
      print "geoupload: error in $filename\n$@\n";
    } else {
      print "geoupload: $filename: $!\n";
    }
    exit 1;
  }
  if (! defined $username) { print "\$username not set in $filename file\n"; }
  if (! defined $password) { print "\$password not set in $filename file\n"; }
  if (! defined $basedir)  { print "\$basedir not set in $filename file\n"; }
  if ($verbose) { print "Username $username, under $basedir\n"; }
}

# cache directory private perms 700 since it contains login cookies
my $cachedir = File::Spec->catdir ($basedir, '.geoupload-cache');
File::Path::mkpath ($basedir, {verbose => $verbose});
File::Path::mkpath ($cachedir, {verbose => $verbose,
                                mode => 0700});

my $cookie_filename = File::Spec->catfile ($cachedir, 'cookies.txt');
my $result_filename = File::Spec->catfile ($cachedir, 'result.html');
my $upload_filename     = File::Spec->catfile ($cachedir, 'upload.html');
my $upload_url_filename = File::Spec->catfile ($cachedir, 'upload.url');
my $verify_saved_remote = File::Spec->catfile ($cachedir, 'verify-remote');

if ($verbose >= 2) {
  LWP::Debug::level('+trace');
  LWP::Debug::level('+debug');
}

my $mech = WWW::Mechanize->new (autocheck => 1,     # fatal get(), request()
                                stack_depth => 0,   # don't need back() etc
                                onerror => \&mech_die);
$mech->cookie_jar({ file => $cookie_filename,
                    autosave => 1,
                    # yahoo cookies normally marked discard to be browser
                    # per-session only
                    ignore_discard => 1 });
push @{ $mech->requests_redirectable }, 'POST';
# $mech->max_redirect (0);
$mech->add_handler
  (request_send => sub {
     my ($req, $ua, $h) = @_;
     if ($verbose >= 2) { print "request_send:\n"; $req->dump; }
     return;
   });

my $current_action;
sub action {
  my ($a) = @_;
  $current_action = $a;
  if ($verbose) { print "$a\n"; }
}
# 'onerror' handle for $mech
sub mech_die {
  my @args = @_;
  print STDERR "$current_action error\n";
  if (my $resp = $mech->res) {
    if ($verbose >= 2) {
      print STDERR "\n",$resp->headers_as_string(),"\n";
    } elsif ($verbose) {
      print STDERR $resp->status_line,"\n";
    }
  }
  die @args;
}


#------------------------------------------------------------------------------
# login stuff

# 7 days, in seconds
my $login_maxage = 7 * 24 * 60 * 60;

# return true if we're still logged in, in particular that the last login
# wasn't too long ago
#
# HTTP::Cookies doesn't purge cookies that have reached their maxage, so
# much check the value of our special 'login_time' cookie, not just see if it
# exists
#
sub login_still_p {
  my $logged_in = 0;
  $mech->cookie_jar->scan
    (sub {
       my($version,$key,$val,$path,$domain,$port,
          $path_spec,$secure,$expires,$discard,$rest) = @_;
       if ($domain eq 'geoupload.special'
           and $key eq 'login_time') {
         my $login_time = $val + 0;
         my $age = time() - $login_time;
         if ($age >= 0 && $age < $login_maxage) {
           $logged_in = 1;
         }
       }
     });
  return $logged_in;
}

sub ensure_login {
  if (login_still_p()) { return; }

  $mech->cookie_jar->clear();

  action ('Get login page');
  $mech->get ('https://login.yahoo.com');

  action ('Logging in');
  $mech->submit_form (fields => { login  => $username,
                                  passwd => $password,
                                });

  # this redirects to a specific page
  action ('Geocities start');
  $mech->get ('http://geocities.yahoo.com/');

  action ('Manage');
  $mech->save_content ($result_filename);
  $mech->follow_link (text => 'Manage')
    or die "Start page 'Manage' link not found (see $result_filename)";

  action ('File Manager');
  $mech->save_content ($result_filename);
  $mech->follow_link (text => 'File Manager')
    or die "Manage page 'File manager' link not found (see $result_filename)";

  action ('File manager start');
  $mech->save_content ($result_filename);
  $mech->form_with_fields ('extchoice')
    or die "File manager 'Click here to open' form not found (see $result_filename)";
  $mech->click();

  action ('Upload page');
  $mech->save_content ($result_filename);
  $mech->click ('op-uploadtodir')
    or die "File manager 'Upload' button not found (see $result_filename)";
  $mech->save_content ($upload_filename);
  writefile ($upload_url_filename, $mech->uri->as_string);

  login_done();
  $mech->cookie_jar->save();
}

# add a cookie noting when logged in
sub login_done {
  my $t = time();
  $mech->cookie_jar->set_cookie
    (0,                   # version
     'login_time',        # key
     $t,                  # value
     '/',                 # path
     'geoupload.special', # domain
     12345,               # port
     1,                   # path_spec
     1,                   # secure
     $login_maxage,       # maxage
     0,                   # discard
     { 'Comment' => "logged in at @{[HTTP::Date::time2iso($t)]} (local time)"
     });
}

#------------------------------------------------------------------------------
# path stuff

sub local_to_remote_file {
  my ($local_filename) = @_;
  my $remote_filename = local_to_remote_dir ($local_filename);
  my $remote_directory = File::Basename::dirname ($remote_filename);
  $remote_filename = File::Basename::basename ($remote_filename);
  if ($remote_directory eq '.') {
    $remote_directory = '';
  }
  if ($verbose){print "    remote dir '$remote_directory' file '$remote_filename'\n";}
  return ($remote_directory, $remote_filename);
}

sub local_to_remote_dir {
  my ($local_filename) = @_;
  $local_filename = File::Spec->rel2abs ($local_filename);
  $local_filename = File::Spec->canonpath ($local_filename) ;
  $local_filename =~ s{/[^/]+/\.\./}{/};

  if (! ($local_filename =~ m{^\Q$basedir\E($|/(.*)$)}s )) {
    print "File $local_filename is not under configured basedir $basedir\n";
    exit 1;
  }
  return (defined $2 ? $2 : '');
}


#------------------------------------------------------------------------------
# upload stuff

my $throttled_speed = 3300; # bytes/second
my $throttled_time_send = 5; # seconds
my $throttled_time_idle = 5; # seconds

my $throttled_content;
my $throttled_nextsleep = 0;
sub throttled_send {
  my $chunk = substr ($throttled_content, 0,
                      $throttled_speed * $throttled_time_send);
  $throttled_content = substr ($throttled_content, length ($chunk));

  if ($throttled_nextsleep > 0 && length($chunk) > 0) {
    if ($verbose >= 2) { print "  (throttle sleep $throttled_nextsleep)\n"; }
    sleep $throttled_nextsleep;
  }
  $throttled_nextsleep = $throttled_time_send + $throttled_time_idle;

  if ($verbose >= 2) {
    print "  (throttle send ", length ($chunk), " bytes, ",
      length($throttled_content), " remaining)\n";
  }
  return $chunk;
}

sub upload {
  my ($local_filename) = @_;

  action ("Upload $local_filename");
  if ($local_filename =~ /\.tar.gz$/) {
    print "Caution: .tar.gz files may be mangled by geocities\n";
    print "You might have to name it .tar.gz.bin to force to binary download\n";
  }
  my ($remote_directory, $remote_filename)
    = local_to_remote_file ($local_filename);
  if ($verbose) { print "    to '$remote_directory' / '$remote_filename'\n"; }

  my $upload_html = Perl6::Slurp::slurp ($upload_filename);
  my $upload_url  = Perl6::Slurp::slurp ($upload_url_filename);

  my $form = HTML::Form->parse($upload_html, $upload_url)
    or die "$upload_filename not a form\n";
  $form->value('directory', $remote_directory);
  my $userfile_input = $form->find_input('userfile') or die;
  $userfile_input->file($local_filename);
  $userfile_input->filename($remote_filename);
  my $req = $form->click('op-upload');

  if ($throttle) {
    # this switch to a sending procedure leaves the 'Content-Length' header
    # in $req, which makes $ua->request() send as plain, not as chunked
    $throttled_content = $req->content();
    $req->content(\&throttled_send);
    $throttled_nextsleep = 0;  # no sleep in between multiple files
  }

  my $resp = $mech->request($req);
  $mech->save_content ($result_filename);

  if ($mech->content =~ /Uploaded successfully/) {
    if ($verbose) { print "    successful\n"; }
  } else {
    print "Upload failed: $local_filename -> $remote_directory / $remote_filename\n";
    print "See $result_filename for reason(s)\n";
    if ($verbose >= 2) { print "\n",$resp->headers_as_string(),"\n"; }
    exit 1;
  }
}

#------------------------------------------------------------------------------
# verify

my $verify_ok_count = 0;
my $verify_all_ok = 1;
sub verify {
  my ($local_filename) = @_;
  my $local_content = Perl6::Slurp::slurp ($local_filename);
  $mech->stack_depth (1); # don't save content

  action ("Verify $local_filename");
  my ($remote_directory, $remote_filename)
    = local_to_remote_file ($local_filename);
  if ($remote_directory ne '') {
    $remote_directory .= '/';
  }
  my $url = "http://www.geocities.com/$username/$remote_directory$remote_filename";
  $mech->get($url);
  if ($verbose >= 2) { print $mech->response->headers->as_string; }

  # Normally want $mech->content which is decompressed html or txt.  But if
  # there's a .gz in the filename then geocities adds "Content-Encoding:
  # gzip" which $mech->content will decompress, but we want the raw
  # compressed content in that case to compare to the local .gz compressed
  # content.
  my $remote_content = ($remote_filename =~ /\.gz$|\.gz\./
                        ? $mech->response->content
                        : $mech->content);
  my $original_remote_content = $remote_content;

  if ($mech->ct eq 'text/html') {
    $remote_content =~ s/\r?\n<!-- following code added by server.*\r?\n.*?<!-- preceding code added by server.*//
      or do {
        if ($verbose) {
          print "$local_filename: oops, geocities header not matched\n";
        }
      };
    $remote_content =~ s/^<!-- text below generated by server.*//sm
      or do {
        if ($verbose) {
          print "$local_filename: oops, geocities footer not matched\n";
        }
      };
  }

  if ($remote_content eq $local_content) {
    if ($verbose) { print "$local_filename: ok\n"; }
    $verify_ok_count++;
  } else {
    print "$local_filename: different\n";
    print "  local size ",length($local_content),
      "  remote size ",length($remote_content), "\n";
    print "  remote content saved in $verify_saved_remote\n";
    writefile ($verify_saved_remote, $original_remote_content);
    $verify_all_ok = 0;
  }
}


#------------------------------------------------------------------------------
# ls listing

sub ls {
  my ($local_directory) = @_;

  action ("Listing $local_directory");
  my $remote_directory = local_to_remote_dir ($local_directory);
  if ($verbose) { print "    remote '$remote_directory'\n"; }

  mech_dir_get ($remote_directory);
  my $content = $mech->content;

  my $te = HTML::TableExtract->new
    (headers => ['Name', 'Last Modified', 'Size'],
     slice_columns => 0);

  $te->parse ($content);
  my $ts = $te->first_table_found
    || die "File manager listing table not matched (see $result_filename for contents)";

  my @table;
  foreach my $row ($ts->rows) {
    my ($tickbox, $subdir, $filename, $link, $lastmod, undef, $size) = @$row;
    if (defined $filename) {
      push @table, [ $filename, $lastmod, "$size kb" ];
    } elsif (defined $subdir) {
      $subdir =~ tr/\x{A0}/ /;   # &nbsp
      $subdir =~ s/^\s+|\s+$//g; # leading/trailing whitespace
      $subdir .= "/";
      push @table, [ $subdir, $lastmod, '' ];
    }
  }

  my $name_width = max (10, map {length($_->[0])} @table);
  my $date_width = max (10, map {length($_->[1])} @table);
  foreach my $elem (@table) {
    printf("%-*s  %-*s  %s\n",
           $name_width, $elem->[0],
           $date_width, $elem->[1],
           $elem->[2]);
  }
}

sub mech_dir_get {
  my ($remote_directory) = @_;
  action ("Directory '$remote_directory'");
  my $url;
  if ($remote_directory eq '') {
    $url = 'http://geocities.yahoo.com/filemanager';
  } else {
    $url = "http://geocities.yahoo.com/filemanager?directory="
      . URI::Escape::uri_escape($remote_directory);
  }
  my $resp = $mech->get ($url);
  $mech->save_content ($result_filename);
  return $resp;
}

#------------------------------------------------------------------------------
# stats

sub stats {
  my @local_filenames = @_;
  ensure_login ();

  print "Total  Highest monthly        File\n";
  #          8  1234 (September 2008)

  foreach my $local_filename (@local_filenames) {
    action ("Stats $local_filename");
    my ($remote_directory, $remote_filename)
      = local_to_remote_file ($local_filename);

    my $url = "http://geocities.yahoo.com/stats?fn=$remote_directory/$remote_filename&ops=100&domain=";
    $mech->get($url);
    $mech->save_content ($result_filename);
    my $content = $mech->content;

    my $te = HTML::TableExtract->new (headers => [qr/Total page views/i]);
    $te->parse($content);
    if (! $te->tables) {
      if ($content =~ /No data is recorded/i) {
        printf ("<no data recorded>            %s\n", $local_filename);
        next;
      }
      print "geoupload: status headings for $local_filename not recognised (see $result_filename for contents)\n";
      next;
    }
    my @tables = $te->tables;
    $te = HTML::TableExtract->new (depth => $tables[0]->depth,
                                   count => $tables[0]->count);
    $te->parse($content);

    my $total = '';
    my $highest = '';
    foreach my $row ($te->rows) {
      my ($name, $value) = @$row;
      if (! defined $name) { next; }
      if (! defined $value) { $value = ''; }
      $value =~ s/^\s+|\s+$//g; # leading/trailing whitespace
      if ($name =~ /Total\s+page\s+views/i) {
        $total = $value;
      } elsif ($name =~ /Highest\s+monthly/i) {
        $highest = $value;
      }
    }

    $highest =~ /^(\d+)/;
    my $pad = max (0, 5 - length $1);
    $highest = (' ' x $pad) . $highest;

    printf ("%5s %-23s %s\n", $total, $highest, $local_filename);
  }
}


#------------------------------------------------------------------------------
# rm remove

sub rm {
  my ($local_filename) = @_;

  action ("Remove $local_filename");
  my ($remote_directory, $remote_filename)
    = local_to_remote_file ($local_filename);

  mech_dir_get ($remote_directory);
  $mech->form_name ('dispfiles')
    or die "Directory page 'dispfiles' form not found";
  $mech->tick ('filename', $remote_filename);
  $mech->click ('op-deletefile')
    or die "Directory page 'Delete' button not found";

  $mech->save_content ($result_filename);
  $mech->form_number (1)
    or die "Directory page 'dispfiles' form not found";
  $mech->click_button (value => 'Delete Files');

  $mech->save_content ($result_filename);
}

#------------------------------------------------------------------------------
# main

if ($option_action eq 'ls') {
  ensure_login ();
  if (! @ARGV) { push @ARGV, getcwd(); }  # default current dir
  foreach (@ARGV) {
    ls ($_);
  }

} elsif ($option_action eq 'rm') {
  ensure_login ();
  foreach (@ARGV) {
    rm ($_);
  }

} elsif ($option_action eq 'stats') {
  stats (@ARGV);

} elsif ($option_action eq 'verify') {
  if ($#ARGV < 0) {
    print "No files to verify\n";
    exit 1;
  }
  foreach (@ARGV) {
    verify ($_);
  }
  print "$verify_ok_count files ok\n";
  if (! $verify_all_ok) { exit 1; }

} else {
  if (@ARGV == 0) {
    print "No files to upload\n";
    exit 1;
  }
  ensure_login ();
  foreach (@ARGV) {
    upload ($_);
  }
}

exit 0;

__END__



=head1 NAME

geoupload -- upload files to Geocities

=head1 SYNOPSIS

geoupload [--options] filename...

=head1 DESCRIPTION

B<geoupload> uploads files to your Geocities hosted web site
L<http://www.geocities.com>, using its "file manager" web interface.

Create your content locally with the same directory structure as the site,
and setup a F<~/.geoupload> file with your username, password, and local
base directory.  For example

    $username = 'my_username';
    $password = 'secretpassword';
    $basedir  = '/where/i/keep/my/files';

This is actually perl code, so you can put comment lines with C<#> or use
C<< $ENV{HOME} >> or write some conditionals or whatever.  Then for instance
to upload an F<index.html> page

    geoupload /where/i/keep/my/files/index.html

or upload all html pages

    cd /where/i/keep/my/files
    geoupload *.html

Subdirectories under the basedir are supported, but there's no automatic way
to create them (yet), you have to login manually and do that.

B<geoupload> works by clicking through the various login pages to get to
the File Manager upload, the same as you'd do manually with a web browser,
but using the perl LWP module.  (Incidentally, LWP does a good job on
binaries, it can usually send them as 8-bit bytes, where for instance
Mozilla conservatively expands to binhex.)

B<geoupload> keeps a F<.geoupload-cache> directory in your local
C<$basedir>, with saved login cookies and the upload form, so once logged in
each subsequent upload is just a single HTTP request.  A login seems to last
a week or so, but if you have trouble try deleting the whole cache directory
to make a clean start.

=head1 OPTIONS

The command line options are

=over 4

=item --help

Print some brief help information.

=item --ls [dir...]

Print a listing of files at geocities in the given directories, or the
current directory by default.  Eg.

    geoupload --ls

=item --rm filename...

Remove the given files from the geocities server.  (A directory cannot be
removed, yet).

=item --stats filename...

Print hits statistics about the named files.

=item --verbose

Print some diagnostics about what's being done.  With --verbose=2 print
some technical details too.  Eg.

    geoupload --verbose myfile.html

=item --verify

Don't upload, instead download the named files and check they're the same as
the local copies.

For HTML files C<geoupload> attempts to undo the advertising inserted by the
geocities server before comparing.  This might be a bit fragile and so might
report a HTML file as different when it's in fact ok.

=item --version

Print the geoupload program version number.  With C<--verbose> also print
some of the module version numbers.

=back

=head1 PREREQUISITES

The modules and packages required are

    WWW::Mechanize
    LWP
    HTML::TableExtract
    Perl6::Slurp
    Crypt::SSLeay or IO::Socket::SSL

The SSL modules are per the LWP F<README.SSL> file, either one suffices.

=head1 FILES

=over 4

=item F<~/.geoupload>

Configuration file.

=item F<$basedir/.geoupload-cache/*>

Various cached files.  Delete them to force a fresh login, if perhaps
something fishy seems to be happening.

=back

=head1 BUGS

Uploads to a non-existant subdirectory are sent, but silently do nothing.
The error from C<--ls> in a non-existant subdirectory is obscure too.

C<--rm> of a non-existant file die with some obscure Mechanize messages.

=head1 SEE ALSO

L<LWP>

=head1 HOME PAGE

L<http://www.geocities.com/user42_kevin/geoupload/index.html>

=head1 LICENSE

Copyright 2007, 2008, 2009 Kevin Ryde

geoupload is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

geoupload is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You can get a copy of the GNU General Public License online at
L<http://www.gnu.org/licenses/>.

=cut
