Участник:ShurShur/getwikidumps.pl
Материал из Википедии — свободной энциклопедии
См. также: Участник:ShurShur/Dumps
#!/usr/bin/perl
# getwikidumps 20060621 by Shurik
#
# This script downloads all Wikimedia dumps from URL $base/$language$project
#
# For every selected <language><project> pair it looks up the most recent
# dump directory (named YYYYMMDD), lists the files published there and
# fetches each one with wget into ./<language><project>/<date>/.
# Files that already exist locally, or that are named in @skipfiles,
# are not downloaded again.
#
use strict;
use warnings;

use Getopt::Long;
#use AppConfig;
use LWP::Simple;
use File::Path qw(make_path);

# standard projects (w/o language prefix)
my @projects = (
    "wiki",          # Wikipedia
    "wikibooks",     # WikiBooks
    "wikinews",      # WikiNews
    "wikiquote",     # WikiQuote
    "wikisource",    # WikiSource
    "wiktionary"     # Wiktionary
);

# language prefixes and references to projects
my %languages = (
    "ru"      => \@projects,    # Russian
    "commons" => ["wiki"],
    "meta"    => ["wiki"],
);

# files to skip
my @skipfiles = (
    "pages-meta-history.xml.bz2"    # Dup of better compressed pages-meta-history.xml.7z
);

# base url; stored WITHOUT a trailing slash because every user of $base
# appends "/..." itself (a trailing slash here would yield "//" in URLs)
my $base = "http://download.wikimedia.org";

# autoflush STDOUT so progress lines appear immediately and nothing is
# left in the buffer when we fork for wget
$| = 1;

# help()
# Prints the usage message and terminates the program.
# Also used as the Getopt::Long handler for -h, so any arguments are ignored.
sub help {
    my $me = $0;
    $me =~ s#.*/##;    # strip the directory part of the script path
    print <<EOF;
Usage: $me [OPTIONS] <project definitions>....

Project definitions:
  <lang>:<project> - e.g. ru:wikibooks for ru.wikibooks.org
                     or meta:wiki for meta.wikimedia.org
  <lang>           - same as <lang>:wiki
  <lang>:          - same as <lang>:wiki

Options:
  -h - this help

Sample call:
  $me ru: ru:wikibooks meta commons:wiki
      - download dumps for ruwiki, ruwikibooks, metawiki & commonswiki
EOF
    exit;
}

# parse @ARGV

# Becomes true once the first project definition is seen on the command
# line; at that moment the built-in %languages defaults are discarded.
my $owndumplist = 0;

# adddump($definition)
# Getopt::Long "<>" handler: registers one "<lang>[:<project>]" definition
# in %languages. A missing or empty project means "wiki".
sub adddump {
    my ($arg) = @_;

    if (!$owndumplist) {
        $owndumplist = 1;
        %languages   = ();
    }

    # Getopt::Long may hand over an object; stringify it before splitting.
    my ($lang, $project) = split /:/, "$arg";
    if (!defined $lang || $lang eq '') {
        warn "ignoring project definition without a language: '$arg'\n";
        return;
    }
    $project = "wiki" if !defined $project || $project eq '';

    my $ref = $languages{$lang} ||= [];

    # do not queue the same project twice (e.g. "ru ru:")
    push @$ref, $project unless grep { $_ eq $project } @$ref;
    return;
}

# "h|help" keeps the original -h and additionally accepts --help.
GetOptions("h|help" => \&help, "<>" => \&adddump) or help();

# getlastdate($language, $project)
# Fetches the index page of the project and returns the greatest YYYYMMDD
# directory name linked from it, or nothing (undef in scalar context) when
# the page cannot be fetched or contains no such link.
sub getlastdate {
    my ($language, $project) = @_;

    my $pg = get("$base/$language$project/");
    return unless defined $pg;

    # 8-digit dates sort correctly as strings; do not rely on page order
    my @dates = sort($pg =~ m#href="(\d{8})/"#ig);
    return unless @dates;
    return $dates[-1];
}

# getfilelist($language, $project, $date)
# Fetches the index page of one dump and returns the list of file name
# suffixes, i.e. the part following "<language><project>-<date>-".
# Returns an empty list when the page cannot be fetched.
sub getfilelist {
    my ($language, $project, $date) = @_;

    my $pg = get("$base/$language$project/$date/");
    return unless defined $pg;

    # \Q..\E: the prefix is data, not a pattern
    my $prefix = "$language$project-$date-";
    my @names  = ($pg =~ m#./\Q$prefix\E(.+?)"#ig);

    # The names come from a remote page and end up in local paths,
    # so never accept one that could leave the target directory.
    return grep { !m{/} } @names;
}

# getfile($language, $project, $date, $file)
# Downloads one dump file with "wget -c" into
# <language><project>/<date>/<language><project>-<date>-<file>.
# The data goes to "<target>.tmp" first and is renamed only after wget
# succeeded, so a partial download is never mistaken for a complete file.
# Prints one progress line ending in the transfer rate, "(exists)",
# "(done)", "(skipped)" or "(failed)".
sub getfile {
    my ($language, $project, $date, $file) = @_;

    my $dir  = "$language$project/$date";
    my $name = "$language$project-$date-$file";
    my $url  = "$base/$dir/$name";
    my $loc  = "$dir/$name";

    print scalar(localtime) . ": [$language] $project [$date] $file ";

    # already downloaded, or explicitly excluded
    if (-f $loc || grep { $_ eq $file } @skipfiles) {
        print "(skipped)\n";
        return;
    }

    make_path($dir, { error => \my $errors });
    if ($errors && @$errors) {
        print "(failed)\n";
        warn "cannot create directory $dir\n";
        return;
    }

    sleep 1;    # prevent quick reconnect

    # Run wget without a shell: fork with an implicit pipe, send the
    # child's STDERR (where wget logs) into that pipe, then exec in
    # list form so no argument is ever interpreted by /bin/sh.
    my $pid = open(my $wget, '-|');
    if (!defined $pid) {
        print "(failed)\n";
        warn "cannot fork for wget: $!\n";
        return;
    }
    if ($pid == 0) {
        open(STDERR, '>&', \*STDOUT) or exit 127;
        exec 'wget', '-c', $url, '-O', "$loc.tmp";
        exit 127;    # exec failed (wget not installed?)
    }

    my $ok = 0;
    while (my $line = <$wget>) {
        next if $ok;    # keep draining the pipe, but report only once
        if ($line =~ /\d\d:\d\d:\d\d\s\((.+?)\)\s-\s.+?\ssaved\s\[(.+?)\]/) {
            # "<time> (<rate>) - '<file>' saved [<size>]"; the quoting
            # style and a leading date differ between wget versions
            print "($1)\n";
            $ok = 1;
        }
        elsif ($line =~ /The\sfile\sis\salready\sfully\sretrieved;\snothing\sto\sdo/) {
            print "(exists)\n";
            $ok = 1;
        }
    }
    close $wget;    # sets $? to the exit status of wget

    # Neither message recognised (e.g. localized wget output):
    # fall back to the exit status.
    if (!$ok && $? == 0) {
        print "(done)\n";
        $ok = 1;
    }

    if ($ok) {
        rename "$loc.tmp", $loc
            or warn "cannot rename $loc.tmp to $loc: $!\n";
    }
    else {
        print "(failed)\n";
    }
    return;
}

# getproject($language, $project)
# Downloads every file of the latest dump of one project, printing
# "started"/"completed" lines around it, or "[not found]" when no dump
# directory could be determined.
sub getproject {
    my ($language, $project) = @_;

    my $date = getlastdate($language, $project);
    if (!$date) {
        print scalar(localtime) . ": [$language] $project [not found]\n";
        return;
    }

    # print, not printf: the text is data and may contain '%'
    print scalar(localtime) . ": [$language] $project [$date] started\n";
    for my $file (getfilelist($language, $project, $date)) {
        getfile($language, $project, $date, $file);
    }
    print scalar(localtime) . ": [$language] $project [$date] completed\n";
    return;
}

# main: process every selected project; sorted for a reproducible order
for my $language (sort keys %languages) {
    for my $project (@{ $languages{$language} }) {
        getproject($language, $project);
    }
}