#!/usr/bin/perl

# © 2010 Graham Shaw.
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.  This file is offered as-is,
# without any warranty.

# This script filters a Wikipedia dump file to extract page titles.
# It operates on an uncompressed MediaWiki XML dump file, taken from
# stdin, and writes the titles to stdout, for example:
#
# ./extract-wikipedia-titles.pl < pages-articles.xml > titles.txt
#
# The titles are written one per line with XML encapsulation removed.
# Pages in the namespaces 'Template', 'File', 'Category', 'Wikipedia'
# and 'MediaWiki' are disregarded, as are pages that contain a
# <redirect> element.

use strict;
use warnings;

use IO::Handle;
use XML::SAX::ParserFactory;
use XML::SAX::Base;

# The SAX handler lives in package Handler; its methods are declared below
# with fully-qualified names so that they are still compiled in package main
# and can call handle_title() directly.
@Handler::ISA = qw(XML::SAX::Base);

# Print a page title to stdout, followed by a newline, unless it begins with
# one of the excluded namespace prefixes.
#
#   $title - the complete text of the page's <title> element
sub handle_title {
    my ($title) = @_;

    # Titles in these namespaces are not wanted in the output.
    return if $title =~ /^(?:Template|File|Category|Wikipedia|MediaWiki):/;

    print "$title\n";
    return;
}

# Construct a handler.  The per-page state is initialised here so that SAX
# events arriving before the first <page> element never see undefined
# counters:
#
#   title       - text accumulated for the current page's title, or undef
#   title_level - nesting depth of currently open <title> elements
#   redirect    - number of <redirect> elements seen in the current page
sub Handler::new {
    my $class = shift;
    my $self  = {
        title       => undef,
        title_level => 0,
        redirect    => 0,
    };
    return bless $self, $class;
}

# SAX callback for an opening tag.  A <page> resets the per-page state;
# <title> and <redirect> are counted.  All other elements are ignored.
sub Handler::start_element {
    my ($self, $el) = @_;
    my $name = $el->{LocalName};

    if ($name eq "page") {
        $self->{title}       = undef;
        $self->{title_level} = 0;
        $self->{redirect}    = 0;
    }
    elsif ($name eq "title") {
        ++$self->{title_level};
    }
    elsif ($name eq "redirect") {
        ++$self->{redirect};
    }
    return;
}

# SAX callback for a closing tag.  At the end of a <page> the collected title
# is passed to handle_title(), provided that a title was seen and the page
# contained no <redirect> element.  Closing a <title> decrements the nesting
# depth, which is never allowed to go negative.
sub Handler::end_element {
    my ($self, $el) = @_;
    my $name = $el->{LocalName};

    if ($name eq "page") {
        if (defined $self->{title} && $self->{redirect} == 0) {
            handle_title($self->{title});
        }
    }
    elsif ($name eq "title") {
        --$self->{title_level} if $self->{title_level} > 0;
    }
    return;
}

# SAX callback for character data.  Text is appended to the title only while
# exactly one <title> element is open.  A parser may deliver the text of one
# element in several chunks, hence the append rather than an assignment; '.='
# also copes with the title still being undef for the first chunk.
sub Handler::characters {
    my ($self, $data) = @_;

    if ($self->{title_level} == 1) {
        $self->{title} .= $data->{Data};
    }
    return;
}

# Apply the UTF-8 layer to the standard streams.
binmode STDIN,  ':utf8';
binmode STDOUT, ':utf8';

my $handler = Handler->new();
my $p = XML::SAX::ParserFactory->parser(Handler => $handler);

# Hand the parser an IO::Handle opened on the file descriptor of STDIN.
my $fh = IO::Handle->new;
$fh->fdopen(fileno(STDIN), "r")
    or die "failed to open filehandle for STDIN";

$p->parse_file($fh);