#!/usr/bin/perl

# © 2010 Graham Shaw.
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.  This file is offered as-is,
# without any warranty.

# This script operates on an uncompressed MediaWiki XML dump file,
# taken from stdin, and extracts the WikiText of each article:
#
#   ./extract-mediawiki-pages < pages-articles
#
# Each page is written to a separate file below the current working
# directory.  The filename is obtained by converting the page title
# to UTF-8, then calculating the MD5 checksum, then converting to
# hex.  The files are contained within two levels of subdirectory,
# in order to limit the number of files per directory.  The first
# level corresponds to the first two characters of the filename
# and the second level to the first four characters.

use strict;
use warnings;
use utf8;

use IO::Handle;
use MIME::Base64;
use Digest::MD5 qw(md5_hex);
use XML::SAX::ParserFactory;
use XML::SAX::Base;

@Handler::ISA = qw(XML::SAX::Base);

# Write the text of one page to its hashed location below the current
# working directory.
#
#   $title - page title (character string); determines the filename
#   $text  - page content to be written, encoded as UTF-8 on output
#
# Returns 1 on success.  On failure an error is printed to STDERR and
# 0 is returned; the caller may carry on with the next page.
sub write_page_file {
    my ($title, $text) = @_;

    # md5_hex operates on bytes, so encode a copy of the title first.
    my $title_utf8 = $title;
    utf8::encode($title_utf8);
    my $filename = md5_hex($title_utf8);

    my $prefix1 = substr $filename, 0, 2;
    my $prefix2 = substr $filename, 0, 4;
    my $dir     = "$prefix1/$prefix2";
    my $path    = "$dir/$filename";

    # The directories usually exist already, so the results of mkdir are
    # deliberately ignored; a real problem surfaces when open fails.
    mkdir $prefix1;
    mkdir $dir;

    my $out;
    unless (open $out, '>:utf8', $path) {
        print STDERR "Error: failed to open $path for writing: $!\n";
        return 0;
    }

    # Buffered write errors may only show up at close, so check both.
    my $printed = print {$out} $text;
    my $closed  = close $out;
    unless ($printed && $closed) {
        print STDERR "Error: failed to write $path: $!\n";
        return 0;
    }

    return 1;
}

# Construct a handler with all nesting counters and per-page buffers
# in a defined initial state.
sub Handler::new {
    my $class = shift;

    my $self = XML::SAX::Base->new;
    bless $self, $class;

    $self->{PageCount}  = 0;     # pages that had both a title and text
    $self->{PageLevel}  = 0;     # 1 while inside a <page> element
    $self->{TitleLevel} = 0;     # 1 while inside a <title> element
    $self->{TextLevel}  = 0;     # 1 while inside a <text> element
    $self->{Title}      = '';    # title accumulated for the current page
    $self->{Text}       = '';    # text accumulated for the current page

    return $self;
}

# SAX callback for an opening tag.  Tracks entry into page, title and
# text elements, and dies if they are nested in an unexpected way.
# All other elements are ignored.
sub Handler::start_element {
    my ($self, $el) = @_;
    my $name = $el->{LocalName};

    if ($name eq 'page') {
        die 'page within page' if $self->{PageLevel} != 0;
        ++$self->{PageLevel};

        # Start each page with empty buffers.
        $self->{Title} = '';
        $self->{Text}  = '';
    }
    elsif ($name eq 'title') {
        die 'title outside page'   if $self->{PageLevel} != 1;
        die 'title within title'   if $self->{TitleLevel} != 0;
        die 'title within text'    if $self->{TextLevel} != 0;
        die 'second title element' if $self->{Title} ne '';
        ++$self->{TitleLevel};
    }
    elsif ($name eq 'text') {
        die 'text outside page'   if $self->{PageLevel} != 1;
        die 'text within title'   if $self->{TitleLevel} != 0;
        die 'text within text'    if $self->{TextLevel} != 0;
        die 'second text element' if $self->{Text} ne '';
        ++$self->{TextLevel};
    }

    return;
}

# SAX callback for a closing tag.  At the end of a page the collected
# text is written to disc; a page lacking a title or text is skipped
# with a warning.
sub Handler::end_element {
    my ($self, $el) = @_;
    my $name = $el->{LocalName};

    if ($name eq 'page') {
        die 'spurious end of page'     if $self->{PageLevel} != 1;
        die 'end of page within title' if $self->{TitleLevel} != 0;
        die 'end of page within text'  if $self->{TextLevel} != 0;
        --$self->{PageLevel};

        if ($self->{Title} eq '') {
            print STDERR "Warning: page without title\n";
            return;
        }
        if ($self->{Text} eq '') {
            print STDERR "Warning: page without text\n";
            return;
        }

        printf STDERR "Page %lu: %s\n", $self->{PageCount}, $self->{Title};

        # A failed write is reported by write_page_file; the page is
        # still counted so that the numbering follows the input.
        write_page_file($self->{Title}, $self->{Text});
        ++$self->{PageCount};
    }
    elsif ($name eq 'title') {
        die 'spurious end of title' if $self->{TitleLevel} != 1;
        --$self->{TitleLevel};
    }
    elsif ($name eq 'text') {
        die 'spurious end of text' if $self->{TextLevel} != 1;
        --$self->{TextLevel};
    }

    return;
}

# SAX callback for character data.  Appends the data to the title or
# text buffer, depending on which element is currently open; data
# outside both elements is discarded.
sub Handler::characters {
    my ($self, $ch) = @_;

    if ($self->{TitleLevel} == 1) {
        $self->{Title} .= $ch->{Data};
    }
    elsif ($self->{TextLevel} == 1) {
        $self->{Text} .= $ch->{Data};
    }

    return;
}

binmode STDIN,  ':utf8';
binmode STDERR, ':utf8';

my $handler = Handler->new();
my $parser  = XML::SAX::ParserFactory->parser(Handler => $handler);

# The parser reads from an IO::Handle opened on the descriptor of STDIN.
my $stdin_fh = IO::Handle->new;
$stdin_fh->fdopen(fileno(STDIN), "r")
    or die "failed to open filehandle for STDIN";

$parser->parse_file($stdin_fh);