#!/links/bin/perl # # pitchfork review parser, # parses reviews from html format and puts them in the database # loren jan wilson (started 2003-1013) # use strict; use DBI; use Reviews; our $DEBUG = 1; my $usage = <<"EOF"; this script sticks a pitchfork review into the sql database. usage: > pitchfork_review_parser.pl review.txt EOF # looking for these strings for the following seven variables: # 1_artist: Momus # 2_title: Oskar Tennis Champion # 3_label: American Patchwork; 2003 # 4_rating: 2.1 # 5_author: Michael Idov my ($band, $title, $uncut_label, $label, $year, $rating, $author, $date); my $publication = "pitchfork"; # the text goes in here, one paragraph for each scalar my @paragraphs; # used to build each paragraph my $paragraph = ""; my $infile = $ARGV[0] || die $usage; open(INFILE, "< $infile") or die "can't open $infile: $!"; HEADER: while () { my $line = $_; chomp $line; $line =~ s/ //g; # "header ends here" marks the beginning of the text. last HEADER if ($line =~ /HEADER ENDS HERE/); last HEADER if ($line =~ /
/); # 1_artist: Momus if ($line =~ /^1_artist: (.*)$/) { $band = $1; } # 2_title: Oskar Tennis Champion if ($line =~ /^2_title: (.*)$/) { $title = $1; } # 4_rating: 2.1 if ($line =~ /^4_rating: (.*)$/) { $rating = $1; } # 5_author: Michael Idov if ($line =~ /^5_author: (.*)$/) { $author = $1; } # 3_label: American Patchwork; 2003 # have to split the label from the year. if ($line =~ /^3_label: (.*)$/) { $uncut_label = $1; # sometimes there are two labels...ignore the second one if ($uncut_label =~ /(.*?)[:,;] (\d\d\d\d).*/) { $label = $1; $year = $2; } else { $label = $uncut_label; $year = "NULL"; } } } BODY: while() { # 1. make sure lines are complete by joining until we hit # a blank line. # 2. join each line with a " " just in case. # 3. delete everything inside <> in the line (HTML code) # 4. if there's anything left, it's a paragraph. # 5. turn all double spaces into single spaces to offset #2. # 6. cut off leading and trailing spaces my $line = $_; chomp $line; $line =~ s/ //g; # stop reading the file when we get to the footer last BODY if ($line =~ /REVIEW FOOTER STARTS HERE/); # take off some of the HTML code in case this line is only html. $line =~ s/<.*?>//g; # FIX: some very old reviews have tabs in them as paragraph # markers. this is very bad and throws off this algorithm. if ($line =~ /^\s*$/) { # if this line is whitespace or less, # finish up this paragraph and go onto the next one. # delete HTML code. $paragraph =~ s/<.*?>//g; # cut off leading and trailing spaces $paragraph =~ s/^ +//; $paragraph =~ s/ +$//; # turn all multiple spaces into single spaces $paragraph =~ s/ +/ /g; # if this paragraph is empty, don't add it to the array if ($paragraph =~ /^\s*$/) { $paragraph = ""; next BODY; # get rid of the first four lines at beginning } elsif ($paragraph =~ /^$band $title \[$label/) { $paragraph = ""; next BODY; # and get rid of the author's name at the end... } elsif ($paragraph =~ /^-$author/) { # get the date (this is the only place to do it) if ($paragraph =~ /$author, (.*)$/) { $date = $1; } $paragraph = ""; next BODY; } else { # not empty, so add it to the array of paragraphs # get rid of the author's name at the end... push (@paragraphs, $paragraph); $paragraph = ""; next BODY; } } else { # there's something on this line, so # join this line to the current paragraph $paragraph = "$paragraph $line"; } } close(INFILE) or die "can't close $infile: $!"; # we just finished reading the file, so put everything in the database. # we have $band, $title, $label, $year, $rating, $author, @paragraphs # remove superfluous spaces, just in case foreach my $var ($band, $title, $label, $rating, $author) { die "Can't add article $infile: $band:$title:$label:$rating:$author\n" if (! defined $var); $var =~ s/^ +//; $var =~ s/ +$//; $var =~ s/ +/ /g; } die "Can't add article $infile: no paragraphs!\n" if (! defined $paragraphs[0]); # convert rating to 100-point scale (multiply by 10 in pitchfork's case) $rating = $rating * 10; # print some stuff #print "band: $band\ntitle: $title\nlabel: $label\nyear: $year\nrating: $rating out of 100\nauthor: $author\n"; #my $i = 1; #foreach my $para (@paragraphs) { # print "paragraph $i:\n$para\n"; # $i++; #} ### INPUT INTO DATABASE ### # if there is already an entry for this publication+album+year in the # database, replace it with this one, although make sure to output # a warning when we do it. # (do one select here to try to find the same guy, give warning if we do # and use the id numbers to insert the new stuff.) # ask the database for what we need. we will need to come across: # 1. publication.id (or add publication) # 2. rating.id (or add rating) # 3. author.id (or add author) # 4. band.id (or add band) # 5. label.id (or add label) # then add: # album (name, year, ->band_id, ->label_id # article (date, ->author_id, ->publication_id, ->album_id, ->rating_id) # paragraphs (position, text, ->article_id) # escape all quotes on the names foreach my $name ($band, $title, $label, $author) { $name =~ s/"/\\"/g; } # database handle for mysql my $dbh = make_db_handle; # we will need to munge the date in order to put it in the database in # "DATE" format. (need: YYYY-MM-DD) $date = clean_pitchfork_date($date); # remember that on earlier articles, the date will be null since it's not given. ($date = "\"$date\"") if (defined $date); ($date = "NULL") if (! defined $date); #1. find publication.id or quit if it's not there. my $publication_id = dbquery($dbh, qq/SELECT id FROM publication WHERE name like "$publication"/); die "publication id not found!" unless defined $publication_id; #2. find rating.id or add rating to database. my $rating_id = dbquery($dbh, qq/SELECT id FROM rating WHERE rating = $rating/); if (! defined $rating_id) { dbquery($dbh, qq/INSERT INTO rating (id, rating) VALUES ($rating, $rating)/); $rating_id = $rating; } elsif (ref $rating_id) { die "more than one rating for $rating in the database!"; } # 3. find author.id or add author. my $author_id = dbquery($dbh, qq/SELECT id FROM author WHERE name like "$author"/); if (! defined $author_id) { dbquery($dbh, qq/INSERT INTO author (name) VALUES ("$author")/); $author_id = dbquery($dbh, qq/SELECT last_insert_id() FROM author WHERE name like "$author" LIMIT 1/); } # 4. band.id (or add band) my $band_id = dbquery($dbh, qq/SELECT id FROM band WHERE name like "$band"/); if (! defined $band_id) { dbquery($dbh, qq/INSERT INTO band (name) VALUES ("$band")/); $band_id = dbquery($dbh, qq/SELECT last_insert_id() FROM band WHERE name like "$band" LIMIT 1/); } # 5. label.id (or add label) my $label_id = dbquery($dbh, qq/SELECT id FROM label WHERE name like "$label"/); if (! defined $label_id) { dbquery($dbh, qq/INSERT INTO label (name) VALUES ("$label")/); $label_id = dbquery($dbh, qq/SELECT last_insert_id() FROM label WHERE name like "$label" LIMIT 1/); } # add album (name, year, ->band_id, ->label_id) unless it already exists... my $album_id = dbquery($dbh, qq/SELECT id FROM album WHERE band_id = $band_id AND label_id = $label_id AND name like "$title"/); if (! defined $album_id) { dbquery($dbh, qq/INSERT INTO album (name, year, band_id, label_id) VALUES ("$title", $year, $band_id, $label_id)/); $album_id = dbquery($dbh, qq/SELECT last_insert_id() FROM album WHERE name like "$title" LIMIT 1/); } # add article (date, ->author_id, ->publication_id, ->album_id, ->rating_id). # if there is already an article exactly like this, delete the old one! # you will have to delete paragraphs for it as well... my $article_id = dbquery($dbh, qq/SELECT id FROM article WHERE author_id = $author_id AND publication_id = $publication_id AND album_id = $album_id AND rating_id = $rating_id/); if (defined $article_id) { # there is already an article like this...delete it and its paragraphs dbquery($dbh, qq/DELETE FROM article WHERE author_id = $author_id AND publication_id = $publication_id AND album_id = $album_id AND rating_id = $rating_id/); dbquery($dbh, qq/DELETE FROM paragraph WHERE article_id = $article_id/); } # create the article dbquery($dbh, qq/INSERT INTO article (date, author_id, publication_id, album_id, rating_id) VALUES ($date, $author_id, $publication_id, $album_id, $rating_id)/); $article_id = dbquery($dbh, qq/SELECT last_insert_id() FROM article WHERE author_id = $author_id AND publication_id = $publication_id AND album_id = $album_id AND rating_id = $rating_id LIMIT 1/); # add paragraphs (position, text, ->article_id) my $i = 1; foreach my $paragraph (@paragraphs) { $paragraph =~ s/"/\\"/g; dbquery($dbh, qq/INSERT INTO paragraph (position, text, article_id) VALUES ($i, "$paragraph", $article_id)/); $i++; } print "added $band:$title by $author on $date to the database\n"; sub clean_pitchfork_date { my $date = $_[0]; #like "April 3rd, 2003" return undef if (! $date); # get three elements... my ($month, $day, $year); $date =~ /.*?(\w+) (\d+).*? (\d\d\d\d).*?/; $month = $1; $day = $2; $year = $3; return undef if (! defined $month); return undef if (! defined $day); return undef if (! defined $year); # get the number for the month my @months = qw(jan feb mar apr may jun jul aug sep oct nov dec); my %months; my $i = 1; foreach my $name (@months) { my $month_number = sprintf("%02d", $i); $months{$name} = $month_number; $i++; } $month = lc($month); $month = substr($month, 0, 3); $month = $months{$month}; # zero-pad the day $day = sprintf("%02d", $day); #return the stuff my $return = "$year-$month-$day"; unless ($return =~ /\d\d\d\d-\d\d-\d\d/) { die "couldn't parse $date: $year-$month-$day"; } return $return; }