#!/usr/bin/perl

# © 2010 Graham Shaw.
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.  This file is offered as-is,
# without any warranty.

# This script parses WikiText (using MediaWiki markup) to harvest
# the plain text within it.  There are currently four phases to this
# process:
# 1. Handle tags provided by the Cite extension (refs and references).
# 2. Expand templates and remove comments.
# 3. Handle layout-dependent markup.
# 4. Handle layout-independent markup.
#
# No attempt has been made to duplicate the internal workings of the
# MediaWiki parser, therefore differences in behaviour are expected
# (particularly in the handling of corner cases).  Known issues
# include the following:
# - Templates are not expanded: instead they are removed.  This is
#   OK for block-level structures such as infoboxes, but problematic
#   for templates that are used inline within paragraphs.
# - Similarly for variables and character entities.
# - Horizontal rules are not detected.
#
# Input and output directories are specified using the -i and -o
# options respectively.  The name of each file is the hexadecimal
# MD5 sum of its title expressed as utf-8.  There are two levels
# of prefix directories, one matching the first two characters
# of the filename and one matching the first four characters.
# A list of titles to be parsed should be fed to stdin.
#
# A copy of the MediaWiki Names.php file is needed.  This should be
# placed in the current working directory or, alternatively, its
# location can be specified by means of the -l option.

use strict;
use warnings;
use feature "switch";
use utf8;
use Getopt::Std;
use MIME::Base64;
use Digest::MD5 qw(md5_hex);

# Regular-expression fragments (plain strings, interpolated into the
# matches below).  The named captures identify which kind of token was
# matched:
#   stagid - name of a start tag
#   closed - the '/' of a self-closing start tag
#   etagid - name of an end tag
#   entid  - name or number of a character entity
my $pat_id = "[A-Za-z][A-Za-z0-9-]*";
my $pat_attrlist = "(\\s*(${pat_id}\\s*=\\s*)?(\"[^\"]*\"|'[^']*'|[A-Za-z0-9_-]+))*";
my $pat_stag = "<(?<stagid>${pat_id})${pat_attrlist}\\s*(?<closed>\/)?\\s*>";
my $pat_etag = "<\/(?<etagid>${pat_id})\\s*>";
my $pat_ent = "&(?<entid>${pat_id}|#x?[0-9A-Fa-f]+);";

# Command-line options: -i input directory, -o output directory,
# -l pathname of Names.php.  Both directories default to '.', so at
# least one of them must be given explicitly.
my %opts = ();
getopts('i:l:o:', \%opts);
my $indir = $opts{'i'};
my $outdir = $opts{'o'};
if ((!defined $indir) || ($indir eq '')) {
    $indir = '.';
}
if ((!defined $outdir) || ($outdir eq '')) {
    $outdir = '.';
}
if ($indir eq $outdir) {
    die "input and output directories must differ";
}
my $lc_pathname = $opts{'l'};
if ((!defined $lc_pathname) || ($lc_pathname eq '')) {
    $lc_pathname = 'Names.php';
}

# Collect the quoted keys of every "'xx' => ..." entry in Names.php.
# They are used by unwind() to recognise interlanguage links.
# Note: 'or' (not '||') so that the die applies to open(), not to the
# pathname argument.
my %langcodes = ();
open(my $lc_fh, '<', $lc_pathname) or die "failed to open $lc_pathname";
while (my $entry = <$lc_fh>) {
    chomp $entry;
    $entry =~ s/\s*#.*$//;
    if ($entry =~ /^\s*(['"])([^']+)\1\s*=>/) {
        $langcodes{$2} = 1;
    }
}
close $lc_fh;

# Phase 1: handle extension tags.
#
# Takes the raw page text and returns it with:
#   - <gallery>, <math> and <ref> elements removed together with their
#     content (a self-closing tag is removed on its own);
#   - <references> start tags removed;
#   - every other start tag left in place.
# If one of the removed elements is never closed, everything from its
# start tag to the end of the text is dropped.
sub parse_extensions {
    my $rawtext = shift;
    my $text = '';
    my $index = 0;    # offset of the first character not yet copied

    while ($rawtext =~ m/(${pat_stag})/go) {
        my $token = $1;
        my $stagid = $+{stagid};
        my $closed = $+{closed};    # '/' for <tag/>, otherwise undef

        # Copy the text lying between the previous token and this one.
        $text .= substr $rawtext, $index,
            pos($rawtext) - $index - length($token);
        $index = pos $rawtext;

        if ($stagid =~ /^(?:gallery|math|ref)$/) {
            if (!$closed) {
                # Skip forward to the matching end tag.  The /g match
                # resumes from the current pos() of $rawtext.
                if ($rawtext =~ m/(<\/$stagid\s*>)/g) {
                    $index = pos $rawtext;
                }
                else {
                    # No end tag: a failed /g match has reset pos(),
                    # so stop here and discard the remainder.
                    $index = undef;
                    last;
                }
            }
        }
        elsif ($stagid =~ /^references?$/) {
            # Drop the tag itself.  The Cite tag is 'references'; the
            # original test was for 'reference' only, which is still
            # accepted here.
        }
        else {
            $text .= $token;
        }
    }
    if (defined $index) {
        $text .= substr $rawtext, $index;
    }
    return $text;
}

# Phase 2: remove comments, templates and variables.
#
# Takes text and returns it with every balanced <!-- ... -->,
# {{ ... }} and {{{ ... }}} construct removed, including nested ones.
# A closing token that does not match the innermost open construct is
# kept as literal text.  For constructs still open at the end of the
# text, the opening token is dropped but the content is kept.
sub preprocess {
    my $rawtext = shift;
    my %close_for = ('<!--' => '-->', '{{' => '}}', '{{{' => '}}}');
    my $text = '';
    my @stack = ();       # [enclosing close token, enclosing text]
    my $close = undef;    # token that closes the innermost construct
    my $index = 0;

    while ($rawtext =~ m/(<!--|-->|\{\{\{?|\}\}\}?)/g) {
        my $token = $1;
        $text .= substr $rawtext, $index,
            pos($rawtext) - $index - length($token);
        $index = pos $rawtext;

        if (defined $close && $token eq $close) {
            # Construct closed: discard its content and resume the
            # enclosing text.
            ($close, $text) = @{ pop @stack };
        }
        elsif (defined $close && $close eq '-->') {
            # Inside a comment every other token is just comment
            # content, which is discarded when the comment closes.
        }
        elsif (exists $close_for{$token}) {
            push @stack, [$close, $text];
            $close = $close_for{$token};
            $text = '';
        }
        else {
            # Unmatched closing token: keep it as text.
            $text .= $token;
        }
    }
    $text .= substr $rawtext, $index;

    # Anything still open is treated as ordinary text.
    while (my $item = pop @stack) {
        $text = $item->[1] . $text;
    }
    return $text;
}

# Phase 3: handle layout-dependent (line-oriented) markup.
#
# Takes text and returns its simple paragraphs, one per line.  Lines
# inside a table ({| ... |}) are dropped, as are lines beginning with
# '=', '#', '*', ':' or a space.  An empty line ends the current
# paragraph; the remaining lines of a paragraph are joined with single
# spaces.
sub parse_lines {
    my $rawtext = shift;
    my @paragraphs = ();
    my $paragraph = '';
    my $in_table = 0;

    foreach my $line (split /\n/, $rawtext) {
        if ($line =~ /^\s*\{\|/) {
            # Start table
            $in_table = 1;
        }
        elsif ($line =~ /^\s*\|\}/) {
            # End table
            $in_table = 0;
        }
        elsif ($in_table) {
            # Ignore table markup
        }
        elsif ($line eq '') {
            # An empty line is a paragraph break.
            if ($paragraph ne '') {
                push @paragraphs, $paragraph;
                $paragraph = '';
            }
        }
        elsif ($line =~ /^[=#*: ]/) {
            # Ignore line if it is a heading, a list item, indented or
            # preformatted, on the grounds that while these might
            # contain usable text, they are less likely to than simple
            # paragraphs.
        }
        else {
            if ($paragraph ne '') {
                $paragraph .= ' ';
            }
            $paragraph .= $line;
        }
    }
    if ($paragraph ne '') {
        push @paragraphs, $paragraph;
    }
    return join "\n", @paragraphs;
}

# Close open constructs on the parse_tags() stack.
#
#   $stack       - reference to the stack of [open token, outer text]
#   $text        - text collected inside the innermost construct
#   $close_token - open token to unwind to, or undef to unwind all
#
# Entries are popped and folded into the text until one whose open
# token equals $close_token has been processed.  If no such entry is
# found, the stack and text are restored unchanged.  Returns the
# resulting text.
sub unwind {
    my ($stack, $text, $close_token) = @_;
    my @old_stack = @$stack;
    my $old_text = $text;

    while (my $item = pop @$stack) {
        my ($open_token, $up_text) = @$item;
        if ($open_token =~ /^<(?:table|ul|ol)$/) {
            # ignore the content of these elements
            $text = '';
        }
        elsif ($open_token eq '[') {
            # Remove everything up to and including the first space.
            $text =~ s/^[^ ]+ //;
        }
        elsif ($open_token eq '[[') {
            my $langcode = undef;
            if ($text =~ /^([^:]+):/) {
                $langcode = $1;
            }
            if ($text =~ /^:?(?:Category|File|Image):/) {
                # Ignore category markers and images
                $text = '';
            }
            elsif (defined $langcode && exists $langcodes{$langcode}) {
                # Ignore interlanguage links
                $text = '';
            }
            elsif ($text =~ /[|]([^|]+)$/) {
                # Piped link: keep only the text after the last '|'.
                $text = $1;
            }
        }
        elsif ($open_token eq '{|') {
            # Ignore the content of tables.
            $text = '';
        }
        $text = $up_text . $text;
        if (defined $close_token && $open_token eq $close_token) {
            return $text;
        }
    }
    if (defined $close_token) {
        # Nothing matched: treat the closing token as if it were absent.
        @$stack = @old_stack;
        $text = $old_text;
    }
    return $text;
}

# Phase 4: handle layout-independent (inline) markup.
#
# Takes text and returns it with tags, link brackets, bold/italic
# quote runs and character entities removed.  The content of links and
# elements is rewritten by unwind() when they close.
sub parse_tags {
    my $rawtext = shift;
    my $text = '';
    my @stack = ();
    my $index = 0;

    while ($rawtext =~ m/(${pat_stag}|${pat_etag}|${pat_ent}|\[\[?|\]\]?|''('('')?)?)/go) {
        my $token = $1;
        my $stagid = $+{stagid};
        my $closed = $+{closed};
        my $etagid = $+{etagid};
        my $entid = $+{entid};

        $text .= substr $rawtext, $index,
            pos($rawtext) - $index - length($token);
        $index = pos $rawtext;

        if (defined $stagid && $stagid ne '') {
            if ($closed) {
                # Empty elements are ignored.
            }
            else {
                push @stack, ['<' . $stagid, $text];
                $text = '';
            }
        }
        elsif (defined $etagid && $etagid ne '') {
            $text = unwind(\@stack, $text, '<' . $etagid);
        }
        elsif ($token eq '[' || $token eq '[[') {
            push @stack, [$token, $text];
            $text = '';
        }
        elsif ($token eq ']') {
            $text = unwind(\@stack, $text, '[');
        }
        elsif ($token eq ']]') {
            $text = unwind(\@stack, $text, '[[');
        }
        elsif ($token =~ /^''+$/) {
            # Italic and bold markers are ignored.
        }
        elsif (defined $entid) {
            # Not yet implemented: the entity is dropped.
        }
    }
    $text .= substr $rawtext, $index;
    return unwind(\@stack, $text, undef);
}

# Main loop: one page title per line on stdin.  Each page is read from
# $indir/<xx>/<xxxx>/<md5>, passed through the four phases and written
# to the same relative path under $outdir.  Failures are reported on
# stderr without stopping the run.
binmode STDIN, ':utf8';
binmode STDERR, ':utf8';
my $count = 0;
TITLE:
while (my $title = <STDIN>) {
    chomp $title;
    my $title_utf8 = $title;
    utf8::encode($title_utf8);
    my $filename = md5_hex($title_utf8);
    my $prefix1 = substr $filename, 0, 2;
    my $prefix2 = substr $filename, 0, 4;
    print STDERR "Page $count: $filename ($title)\n";

    my $is_redirect = 0;
    eval {
        open(my $in_fh, '<', "$indir/$prefix1/$prefix2/$filename")
            or die "failed to open $filename for reading";
        binmode $in_fh, ':utf8';
        my @lines = <$in_fh>;
        close $in_fh;

        if (@lines && $lines[0] =~ /^#REDIRECT/) {
            # Leave the eval normally; the loop control happens below.
            $is_redirect = 1;
            return;
        }

        my $text = join '', @lines;
        $text = parse_extensions($text);
        $text = preprocess($text);
        $text = parse_lines($text);
        $text = parse_tags($text);

        # The prefix directories may already exist, so the results of
        # mkdir are not checked; a real failure surfaces in the open.
        mkdir "$outdir/$prefix1";
        mkdir "$outdir/$prefix1/$prefix2";
        open(my $out_fh, '>', "$outdir/$prefix1/$prefix2/$filename")
            or die "failed to open $filename for writing";
        binmode $out_fh, ':utf8';
        print {$out_fh} $text;
        close $out_fh or die "failed to close $filename after writing";
    };

    # Redirect pages are skipped and do not advance the page counter.
    next TITLE if $is_redirect;

    if ($@) {
        print STDERR "Error: $@";
    }
    ++$count;
}