#!/usr/bin/perl


#Current version: v1.0b
#	Changes since 1.0:
#		--Fixed a typo that made strictness break the script on unix.
#		--Fixed misuse of mkdir() that broke the script on Solaris.
#	Changes since last version:
#		--NOW ABLE TO DOWNLOAD MORE THAN ONE FILE SIMULTANEOUSLY! The user can specify the number of simultaneous downloads with the -t option, which was meant to stand for "threads" though I wound up using Perl's fork instead. Still, you get the idea.
#		--Took the '=' chars out of the wget call so as to please more shells.
#		--Rearranged/Cleaned much code
#		--Code-testing with -s works ok now.
#		--Added the ability to take ezcodes from a file with -i either with a code on the commandline, or alone. It'll just make things easier to just cut/paste from a list of EZCodes this way.
#		--Also made it possible to include multiple EZCodes on the commandline.
#		--Modified the behaviour of -r for renaming so that it still places the pictures into their subdirectory, leaving -d to make decisions about directories.
#		--Added an option -p to specify not to put pictures into a subdirectory. Meant to be used in conjunction with -r, but this is not necessary.
#
#CREDIT AND CONTACT:
#		This script written by derMoerder. You may contact me via ICQ @ 23115071, or just find me on the SomethinAwful Forums(or DPPHv2) as derMoerder.
#
#NOTE:
#		This script will assume you have wget in your path, or in the same directory as this script, and that either your OS supports Unix fork, or your Perl emulates it correctly.
#		Wget must be at least v1.6.
#


use strict;
use Getopt::Std;
use vars qw( $opt_c $opt_r $opt_t $opt_s $opt_i $opt_p ); #c is the cutoff filesize in kilobytes, r is a bool of whether to rename according to the last ezfield or not(as opposed to using subdirs), and t is the number of downloading threads to keep going concurrently.


my $VERSION = "v1.0b"; #script version, shown in PrintHelp()'s footer
my $EMAIL = "derM\@eden.rutgers.edu"; #author contact, interpolated into error/help output



main(); #all real work happens in main(); the subs below are its helpers


sub main {
	#Entry point: parses the command-line switches, gathers EZCodes from the
	#command line and/or an input file (-i), parses each one with EzParse(),
	#then downloads each numbered file range using up to $opt_t concurrent
	#forked wget processes. Undersized downloads are pruned by CleanUp().
	my %optDef = (	c => 10, #def cutoff is 10KB
					r => 0,	#def behaviour is to put downloads in a subdir named by the last field of the ezcode
					t => 7, #def number of simultaneous downloads
					s => 0, #def behavior is to actually download files
					i => "", #def behaviour is _not_ to have ezcodes input from a file
					p => 0 );#def behaviour is to put the files into a dir
	my @ezcodes; #holds codes from cmdline and, according to options, from a text file. both can be used at once.
	my $ezcode;
	my @parsedCodes; #chunked-up codes returned from EzParse()
	my @activeCode; #holds the code we're currently using from @parsedCodes after we take the stuff off of it
	my ( $i, $k, $j ); #loopvars
	my $spiderOpt = ""; #gets "--spider" under -s; initialized empty so the wget call never interpolates an undef value
	my ( $pid, %pids ); #hash to hold $opt_t pids at a time for keeping track of our fork()s
	my ( $fileName, $fileNameR ); #in our downloading, we use these when calling wget.. local and remote filenames
	my ( @numTemp1, @numTemp2 ); #will hold the significant digit sets(the sets that change in order) from the beg and end filenames, and length will be used to compute '0' padding


	getopts( 'c:rt:si:p' ); #BUGFIX: was 'c:rt:sdi:', which registered an unused -d and never registered -p, so $opt_p could never be set from the command line


		#if the key wasn't given a value on the command-line, we'll set it to default
	if( !defined $opt_c ) { $opt_c = $optDef{ 'c' }; }
	if( !defined $opt_r ) { $opt_r = $optDef{ 'r' }; }
	if( !defined $opt_t ) { $opt_t = $optDef{ 't' }; }
	if( !defined $opt_s ) { $opt_s = $optDef{ 's' }; }
	if( !defined $opt_i ) { $opt_i = $optDef{ 'i' }; }
	if( !defined $opt_p ) { $opt_p = $optDef{ 'p' }; }


	if( ( $#ARGV < 0 && $opt_i eq $optDef{ 'i' } ) || ( @ARGV && $ARGV[ $#ARGV ] eq "--help" ) ) { #check for help option; the @ARGV guard avoids comparing an undef element when the list is empty
		PrintHelp();
	}#if


	if( $opt_i ne '' ) { #if we have a file for input
		if( !open( EZCODES, $opt_i ) ) {
			print "Couldn't open file \"$opt_i\" for EZCode extraction.\n";
			if( $#ARGV < 0 ) { #if there's nothing in @ARGV, and we can't get any ezcodes from the specified file, then there are no codes and we should exit()
				print "There are no EZCodes given, quitting.\n";
				exit();
			}#if
		} else {
			while( <EZCODES> ) { #go line-by-line through our file looking for ezcodes
				$_ =~ s/\s*$//; #strip trailing whitespace(including the newline)
				$_ =~ s/^\s*//; #strip leading whitespace
				push @ezcodes, $_;
			}#while
			close( EZCODES );
		}#if/else
	}#if
	while( @ARGV ) { #while there are ezcodes left in @ARGV after the switches have been taken care of
		push( @ezcodes, shift( @ARGV ) ); #shove codes one by one onto the end of our ezcodes list
	}#while

		#check bottom^1 for my ramblings on the following behaviour here
	foreach $ezcode( @ezcodes ) {
		push( @parsedCodes, EzParse( $ezcode ) );
	}#foreach
	@ezcodes = (); #clear up some memory, we don't need these anymore now that we have them chunked up for use in @parsedCodes. BUGFIX: was @ezcodes = "", which left a single empty string in the list instead of emptying it
	#just a little reference for myself... here's what was returned from EzParse() for each code:
	# 0 - $bShowPadding, 1 - $startNum, 2 - $endNum, 3 - $fileBeg,
	# 4 - $fileBegR, 5 - $fileEnd, 6 - $extension, 7 - $dirPrefix, 8 - $rdir

	@parsedCodes || die( "No valid EZCodes found, quitting EZPicspl.\n" ); #check that there are any codes returned from the parsing



	while( @parsedCodes ) { #while there's a code left in the array
		undef @activeCode; #clear out the active code
		for( $j=0; $j<9; $j++ ) { #now shift off the chunks for the next ezcode
			push @activeCode, ( shift @parsedCodes );
		}#for


		if( !$opt_s ) {
			if( $opt_p ) { #if they wanna put the files into curdir, rather than subdir
				$activeCode[ 7 ] = "";
			} else {
				mkdir( substr( $activeCode[ 7 ], 0, ( length $activeCode[ 7 ] ) - 1 ), 0700 ); #BUGFIX: the mode must be octal 0700; decimal 700 gave mode 01274. $dirPrefix is all set up with a '/' on the end.. we don't need that in the call to mkdir() so we just take a substr up to it
			}#if/else
		} else { #-s set: have wget only check existence, and don't bother with a subdir
			$spiderOpt = "--spider";
			$activeCode[ 7 ] = "";
		}#if/else


		for( $i=$activeCode[ 1 ]+0; $i<$activeCode[ 2 ] + 1; $i++ ) { #the +0 is on the startNum so that it's treated as an integer with any 0-padding chopped off.. without this ezpicspl won't work with sets numbered like 012-199
			$fileName = ""; #empty it out, we'll be concing the filename onto this
			$fileNameR = "";
			$fileName .= $activeCode[ 3 ];
			$fileNameR .= $activeCode[ 4 ];
			if( $activeCode[ 0 ] ) { #the set is 0-padded, so pad $i out to the width of the ending number
				@numTemp1 = split( //, $i );
				@numTemp2 = split( //, $activeCode[ 2 ] );
				for( $k=0; $k<$#numTemp2 - $#numTemp1; $k++ ) {	#won't insert 0s unless padding is necessary
					$fileName .= '0';
					$fileNameR .= '0';
				}#for
			}#if
			$fileName .= "$i$activeCode[ 5 ]";
			$fileNameR .= "$i$activeCode[ 5 ]";
			if( ( scalar keys %pids ) < $opt_t ) { #if we have room in the pid pool
				if( !defined( $pid = fork() ) ) {
					#failed fork; die() terminates the process, so no exit() is needed after it
					die "Couldn't fork... If you're seeing this message, you're on an operating system\nthat doesn't support or a Perl that doesn't emulate Unix's c fork() function.\nPlease e-mail the author at $EMAIL, and give your OS and what Perl\nyou're using.\nQuitting EZPicspl.\n";
				} elsif( $pid == 0 ) {
					#child: fetch one file, prune it if it's undersized, then exit
					system( "wget -nv --output-document \"$activeCode[ 7 ]$fileName\" $spiderOpt -nd -A \"$activeCode[ 6 ]\" --referer \"$activeCode[ 8 ]$fileName\" --follow-ftp --user-agent \"MSIE 6.66 (X11; I; Etch-A-Sketch 5.4)\"  \"$activeCode[ 8 ]$fileNameR\"" );
					print "\n";
					CleanUp( $activeCode[ 7 ], $fileName );
					exit( 0 );
				} else {
					#parent
					print "Trying \"$fileNameR\", working towards \"$activeCode[ 4 ]$activeCode[ 2 ]$activeCode[ 5 ]\"\n";
					$pids{ $pid } = ""; #make a key in the hash for our new pid
				}#if/else
			} else { #else wait for an open spot, and set up to try to find a place in the pool for this download again
				if( ( $pid = wait() ) == -1 ) { #no children.. let's check if we're done or not
					if( @parsedCodes ) { #make sure there's codes left to be parsed before clearing.. 
						undef %pids; #since it says there's no children, make sure this is empty so we can fill 'er up again
					}
				} else { #else we've got a new pid to pull out of the pool
					delete $pids{ $pid }; #remove the key associated with the next process to end, freeing a spot for our next download
					$i--; #decrement $i so we try the same file again(it didn't go this time, apparently, since we wound up here)
				}#if/else
			}#if/else
		}#for
	}#while




}#main


sub EzParse {
	#@_............................[ 0 ] the ezcode to parse.
	#Parses one EZCode URL of the form http://host/path/STARTFILE?ENDFILE?SUBDIR
	#and returns a flat 9-element list, or an empty list if the code is invalid:
	# 0 - $bShowPadding: true if the numbers are 0-padded(ala 015.jpg)
	# 1 - $startNum: first number of the sequence, padding kept as given
	# 2 - $endNum: last number of the sequence
	# 3 - $fileBeg: local filename text before the changing digits(or the SUBDIR field under -r)
	# 4 - $fileBegR: remote filename text before the changing digits
	# 5 - $fileEnd: filename text after the changing digits
	# 6 - $extension: file extension, for wget's -A accept list
	# 7 - $dirPrefix: "SUBDIR/" target directory prefix
	# 8 - $rdir: remote base url, with a trailing '/'
	my @splitCode; #when we split the ezcode into its parts
	my ( @partsBeg, @partsEnd ); #the parts of the beginning and ending filenames once it's split up by digit sets
	my ( $rdir, $files );	#when we break up the EZCode part of the url.. these are the key parts
	my ( $fileBeg, $fileBegR, $fileEnd ); #holds the portions of the filenames before and after the changing set of digits, to be tacked on each side for the download once we're done parsing
	my ( $startNum, $endNum ); #holds the numbers with which to begin and end the ordered downloading.
	my $bShowPadding = 0; #necessary in some conditionals later to deal with some software/people that put 0's as padding in lower numbers.. ala 015.jpg
	my ( $fileName, $fileNameR ); #the filename after the path, and the filename of the remote file.. will be different if the user sets $opt_r.
	my ( $i, $k ); #loopvars
	my $changingSet; #will hold index in @partsX of  the sets of digits that increment
	my $extension; #will be extraced from $fileEnd for use in the wget parameters..
	my @splitFileName; #for holding the split filename before putting the extension into $extension
	my $dirPrefix;


	#BUGFIX: the match result is now checked; previously a non-matching code fell
	#through with $1/$4 holding stale values from an earlier successful match.
	if( $_[ 0 ] !~ m|^((\w)+://(.*))/([^/]+)$| ) {	#so... $1 will be the url with path up to the filenames, and $4 will be the ezcoded filename part(filestart.ext?fileend.ext?name).
		print "EZCode \"$_[ 0 ]\" is invalid, continuing on to next.\n";
		return;
	}#if
	$rdir = "$1/"; $files = $4;

	@splitCode = split( /\?/, $files ); #split the code into startfile(0) endfile(1) and subdir(2)

	#now we'll need to check for how many sets of digits are in the filename, and see which set of digits changes.. we assume that the last changing set of numbers is the one we want to iterate when downloading, since there's no real way to allow for other behaviour with the way EZCodes are currently formatted.
		#stick all digits blocks from the startfile into @numSets .... this was originally m/\D(\d+)\D/g, but then i realized some wierdo(or some wierd server) might name without the extensions which would have confused it if i left it this way.
	@partsBeg = ( split( /(\d+)/, $splitCode[ 0 ] ) );
	@partsEnd = ( split( /(\d+)/, $splitCode[ 1 ] ) );

	$changingSet = -1;
	#BUGFIX: loop now runs through the LAST chunk too($i<=$#partsBeg; it used to
	#stop at $i<$#partsBeg). Extensionless filenames(e.g. pic01?pic10?dir) end
	#with the digits as the last chunk and were wrongly rejected before.
	for( $i=0; $i<=$#partsBeg; $i++ ) {
		if( $partsBeg[ $i ] ne $partsEnd[ $i ] ) { #if the chunk isn't the same between the begfile and endfile, this is the number we want to use with the looping
			$changingSet = $i;
		}#if
	}#for
	if( $changingSet == -1 ) {
		print "EZCode \"$_[ 0 ]\" is invalid, continuing on to next.\n"; #no longer dies on error, since we still have more codes to parse
		return;
	}#if

	$fileBeg = "";
	$fileBegR = "";
	$fileEnd = "";
	for( $i=0; $i<$changingSet; $i++ ) { #everything before the changing digits
		$fileBeg .= $partsBeg[ $i ];
		$fileBegR .= $partsBeg[ $i ];
	}#for
	$startNum = $partsBeg[ $changingSet ];
	if( $changingSet < $#partsBeg ) { #everything after the changing digits
		for( $i=$changingSet+1; $i<$#partsBeg + 1; $i++ ) {
			$fileEnd .= $partsBeg[ $i ];
		}#for
	}#if

	$endNum = $partsEnd[ $changingSet ];

	@splitFileName = split( /\./, $splitCode [ 0 ] );
	$extension = $splitFileName[ $#splitFileName ]; #last dot-separated piece is the extension

	if( $opt_r ) { #if user wants to rename rather than insert into subdir, according to the cmdline..
		$fileBeg = $splitCode[ 2 ];
	}#if

	$dirPrefix = "$splitCode[ 2 ]/";


	if( substr( $startNum, 0, 1 ) eq "0" ) { #if there's padding in the start, we're gonna need to pad whenever the filenum is of less digits than the ending num, so remember to do so
		$bShowPadding = 1;
	}#if

	return ( $bShowPadding, $startNum, $endNum, $fileBeg, $fileBegR, $fileEnd, $extension, $dirPrefix, $rdir );



}#EZParse


sub CleanUp {
	#args: [0]directory, [1]filename
	#Deletes the downloaded file if it came out smaller than the -c cutoff
	#(in Kilobytes); wget's 404 pages and the like tend to be tiny.
	my ( $dir, $name ) = @_;
	my $path = "$dir$name";

	return unless -e $path; #nothing to do if wget never created the file
	my $size = ( stat( $path ) )[ 7 ]; #size in bytes
	if( $size < $opt_c * 1024 ) { #smaller than the specified cutoff, so kill it
		unlink( $path ) || print( "Couldn't remove file \"$path\".\n" );
		print "\"$path\" was smaller than $opt_c Kilobytes, so it was removed.\n";
	}#if
}#CleanUp

sub PrintHelp {
		#Prints the usage text and exits. Called for --help or when no EZCodes
		#are supplied at all. (The empty prototype on the old declaration was
		#removed; prototypes alter parsing, not argument checking.)
		print "Usage: EZPicspl [OPTIONS]... [EZCODES]...\n";
		print "Download from EZCodes via HTTP/FTP.\n\n";

		print "  --help\tPrint this help and exit.\n";

		print "  -c\t\tInt for cutoff size of downloads in Kilobytes. Default is 10.\n";

		print "  -r\t\tSwitch for whether to rename file by the last field in the\n";
		print "\t\tEZCode. The default state is off.\n"; #BUGFIX: text used to stutter "the/The EZCode"

		print "  -s\t\tSwitch for code testing, doesn't actually download.\n";

		print "  -t\t\tInt for the number of threads to use concurrently for\n";
		print "\t\tdownloading. The default value is 7.\n"; #BUGFIX: help claimed 6, but the default in %optDef is 7

		print "  -p\t\tSwitch to download the file into the root of the current\n";
		print "\t\tdirectory. Meant to be used in conjunction with -r so as to\n";
		print "\t\tavoid overwriting existing files.\n";

		print "  -i\t\tString to specify a file to use as input. This text file\n";
		print "\t\tshould contain EZCodes only, one per line. This may be used\n";
		print "\t\talone or with additional EZCodes on the command line.\n";

		print "\nEZPicspl by Chris Peterson(derMoerder), $VERSION. Report bugs to\n";
		print "$EMAIL.\n";
		print "Website, subject to change: \thttp://www.geocities.com/ezpicspl\n";
		exit( 0 );
}#PrintHelp



#^1
		#my thoughts on the design here.. my options were to either parse all the codes before
		#downloading and (potentially)hog lots of memory with it, parse/download them one by one(which would
		#have made the code more complicated), or have it where Download() does the fork()ing. The
		#problem obvious with this last approach is that as the number of files left in the ezcode dwindles
		#to below the number of concurrent downloads, the script will not download the specified
		#number of files at once until it gets to the next code. 
		#...
		#now that i've babbled to myself sufficiently and thought the matter over, i think i'll go
		#with a version of the first method.. i'll parse all the codes one by one before downloading,
		#and store the information before using it in a list of a few necessary variables for
		#downloading. ie: padding, start and end numbers, etc and have the fork()/download loop(s)
		#just pull what they need out of that. Won't take up too much memory.. and better than parsing
		#the code a bajillion times, methinks.