#!/usr/bin/perl

# © 2010 Graham Shaw.
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.  This file is offered as-is,
# without any warranty.

# This script fetches articles from a local copy of Wikipedia and
# extracts (or tries to extract) the body of each article as plain
# text.  It takes a list of titles from stdin, and places the text
# from each article in a separate file in the current working
# directory:
#
#   ./harvest-wikipedia-text.pl < titles.txt
#
# By default it will fetch from a base URL on localhost:
#
#   http://localhost/mediawiki/index.php
#
# however an alternative location may be specified when the script
# is invoked:
#
#   ./harvest-wikipedia-text.pl http://hostname/path < titles.txt
#
# The text is written one paragraph per line with markup removed.
# An attempt is made to remove material that is less suitable for
# natural language analysis, such as references at the end of an
# article.  The heuristics used to do this are quite fragile, and
# are tuned to the current layout of the English language edition
# of Wikipedia.
#
# Files that already exist in the current working directory are not
# overwritten, so if this script is stopped and restarted then it
# will skip over articles that have already been fetched.  An
# activity log is written to stderr.
use strict;
use warnings;
use utf8;
use LWP::UserAgent;
use HTML::TreeBuilder;

# Directory in which each article is written before being renamed into
# the current working directory, so that a file only appears under its
# final name once it has been written completely.
my $tmp_dir = '.tmp';

# Text accumulated so far for the current article (one paragraph per line).
my $text = '';

# Text accumulated so far for the current paragraph.
my $paragraph = '';

# Finish the current paragraph: remove any [[...]] spans, trim leading and
# trailing spaces, and append the result (if non-empty) to $text as one line.
sub break_paragraph {
    $paragraph =~ s/\[\[[^\]]*\]\]//g;
    $paragraph =~ s/^ *//;
    $paragraph =~ s/ *$//;
    if ($paragraph ne '') {
        $text .= $paragraph . "\n";
        $paragraph = '';
    }
    return;
}

# Return the named attribute of an element, or '' if it is not set, so
# that callers can compare it without triggering "uninitialized" warnings.
sub attr_or_empty {
    my ($element, $name) = @_;
    my $value = $element->attr($name);
    return defined $value ? $value : '';
}

# Inspect the text beneath a heading element.  If any text node is
# "References", "See also" or "External links" (case-insensitive, ignoring
# surrounding whitespace) then die with a message beginning "start of ".
# The main loop catches this and treats it as the end of the article body.
sub check_heading {
    my ($element) = @_;

    if (ref $element) {
        check_heading($_) foreach $element->content_list();
        return;
    }

    my @terminators = (
        [ qr/^\s*references\s*$/i,       'start of references' ],
        [ qr/^\s*see\s+also\s*$/i,       'start of see also' ],
        [ qr/^\s*external\s+links\s*$/i, 'start of external links' ],
    );
    foreach my $terminator (@terminators) {
        my ($pattern, $message) = @{$terminator};
        die $message if $element =~ $pattern;
    }
    return;
}

# Return true if the given element (and everything beneath it) should be
# left out of the harvested text: list items, table cells, scripts,
# reference superscripts, edit-section spans, links to templates,
# the jump-to-nav and thumbnail divs, and the table of contents.
sub is_skipped_element {
    my ($element) = @_;

    my $tag = $element->tag();
    return 1 if $tag eq 'li' || $tag eq 'td' || $tag eq 'script';

    my $class = attr_or_empty($element, 'class');
    return 1 if $tag eq 'sup'   && $class eq 'reference';
    return 1 if $tag eq 'span'  && $class eq 'editsection';
    return 1 if $tag eq 'table' && $class eq 'toc';

    if ($tag eq 'a') {
        return 1 if attr_or_empty($element, 'title') =~ /^Template:/;
    }
    elsif ($tag eq 'div') {
        return 1 if attr_or_empty($element, 'id') eq 'jump-to-nav';
        return 1 if $class =~ /^thumb /;
    }
    return 0;
}

# Walk the children of an element, appending text nodes to $paragraph.
# A <p> element starts a new paragraph; headings are not harvested but
# are passed to check_heading, which may die to end the article early.
sub harvest {
    my ($element) = @_;

    foreach my $child ($element->content_list()) {
        if (!ref $child) {
            # Text node: paragraphs are written one per line, so any
            # embedded newline becomes a space.
            (my $fragment = $child) =~ s/\n/ /g;
            $paragraph .= $fragment;
            next;
        }

        my $tag = $child->tag();
        if ($tag =~ /^h[0-9]$/) {
            check_heading($child);
            next;
        }
        next if is_skipped_element($child);

        break_paragraph() if $tag eq 'p';
        harvest($child);
    }
    return;
}

# Search the tree for <div id="bodyContent"> and harvest its content.
# Other elements are searched recursively; text nodes are ignored.
sub find_content {
    my ($element) = @_;

    foreach my $child ($element->content_list()) {
        next if !ref $child;
        if ($child->tag() eq 'div'
            && attr_or_empty($child, 'id') eq 'bodyContent') {
            harvest($child);
        }
        else {
            find_content($child);
        }
    }
    return;
}

# Write $content to "$leafname.txt" in the current directory by way of a
# temporary file in $tmp_dir.  Returns true on success.  On failure the
# problem is logged to stderr, the temporary file is removed, and false
# is returned so that the caller can carry on with the next article.
sub save_text {
    my ($leafname, $content) = @_;

    my $tmp_path   = "$tmp_dir/$leafname.txt";
    my $final_path = "$leafname.txt";

    my $fh;
    if (!open $fh, '>:utf8', $tmp_path) {
        printf STDERR "Failed to write %s (%s)\n", $tmp_path, $!;
        return 0;
    }

    # Buffered write errors may only surface at close, so check both.
    my $written = print {$fh} $content;
    if (!close $fh) {
        $written = 0;
    }
    if (!$written) {
        my $reason = "$!";
        unlink $tmp_path;
        printf STDERR "Failed to write %s (%s)\n", $tmp_path, $reason;
        return 0;
    }

    if (!rename $tmp_path, $final_path) {
        my $reason = "$!";
        unlink $tmp_path;
        printf STDERR "Failed to rename %s to %s (%s)\n",
            $tmp_path, $final_path, $reason;
        return 0;
    }
    return 1;
}

# Base URL: the first command line argument if given, otherwise localhost.
my $uri = defined $ARGV[0]
    ? $ARGV[0]
    : "http://localhost/mediawiki/index.php";

# The temporary directory may be left over from an earlier run; only a
# failure to create it when it is absent is fatal.
if (!-d $tmp_dir) {
    mkdir $tmp_dir or die "Cannot create directory $tmp_dir: $!\n";
}

my $ua = LWP::UserAgent->new;

# Titles are read from stdin, one per line.  (The first command line
# argument is the base URL, so <> must not be used here.)
while (my $title = <STDIN>) {
    chomp $title;

    # A blank line would fetch the base URL and create a file named ".txt".
    next if $title eq '';

    # A '/' in the title cannot appear in a filename, so substitute '!'.
    (my $leafname = $title) =~ s/\//!/g;

    if (-e "$leafname.txt") {
        printf STDERR "Skipping %s\n", $title;
        next;
    }

    # NOTE(review): the title is appended to the URL without escaping, so
    # titles containing characters such as '?' or '#' may not be fetched
    # as intended.
    my $response = $ua->get("$uri/$title");
    if (!$response->is_success) {
        printf STDERR "Failed to fetch %s (%s)\n",
            $title, $response->status_line;
        next;
    }

    my $content = $response->content;
    utf8::decode($content);

    my $tree = HTML::TreeBuilder->new;
    $tree->parse_content($content);

    # check_heading dies with a "start of ..." message to stop harvesting
    # at the first trailing section; that is expected.  Anything else is a
    # genuine error and is logged rather than silently discarded.
    $text = '';
    my $completed = eval { find_content($tree); 1 };
    if (!$completed) {
        my $reason = $@;
        if ($reason !~ /^start of /) {
            chomp $reason;
            printf STDERR "Error while extracting %s (%s)\n",
                $title, $reason;
        }
    }
    break_paragraph();
    $tree = $tree->delete;

    printf STDERR "Fetched %s\n", $leafname;
    save_text($leafname, $text);
}