#!/links/bin/perl -w # # pitchfork review project: # this script makes the word table, quote table, and joiner table. # for words to paragraphs. # # loren jan wilson (started writing this in october 2003) # use strict; use DBI; use Reviews; # pull paragraphs out one by one. for each paragraph, # 1. find quotes and make quote table # 2. parse sentences out of the paragraph table # 3. parse words from each sentence # 4. make word_sentence table my $dbh = make_db_handle; my $paragraphs = dbquery($dbh, qq/SELECT * FROM paragraph ORDER BY id #LIMIT 630,80 /); # first drop all current entries in these tables. # there are so many interdependencies that the old data is useless. dbquery($dbh, qq/DELETE FROM word/); dbquery($dbh, qq/DELETE FROM quote/); dbquery($dbh, qq/DELETE FROM word_paragraph/); # lock all tables. # this is pretty much just a prayer that this script will go faster. # current running estimate without locks is 5 hours on the 5675 articles! dbquery($dbh, qq/LOCK TABLES word WRITE, quote WRITE, word_paragraph WRITE/); my $paragraph_count; foreach my $paragraph_array (@$paragraphs) { my $paragraph_id = $$paragraph_array[0]; my $position = $$paragraph_array[1]; my $text = $$paragraph_array[2]; my $article_id = $$paragraph_array[3]; my $text_noquotes = $text; $paragraph_count++; if ($paragraph_count >= 50) { dbquery($dbh, qq/ UNLOCK TABLES /); dbquery($dbh, qq/ LOCK TABLES word WRITE, quote WRITE, word_paragraph WRITE /); $paragraph_count = 0; } ### find quotes ### # this is not trivial. for each quote found, make an entry in quote # (id, beginchar, wordcount, contains_slash, ->paragraph_id) # then delete the quotes from the text. my $word_re = qr/\w+'*\w*/; my $quote_re = qr/\s # one space .? # maybe one other character " # a quote mark \w.*? # word character followed by anything [\w,\.?!\)'] # word character or , or . or ? or ! or ) or ' " # ending quote mark \.? #maybe a period \)? # maybe a close paren too \s? # ending space if it's there /x; my $quote_beginchar; while ($text_noquotes =~ /($quote_re)/) { my $quote = $1; #print "quote is $quote\n"; # 0. what was the beginchar of this match? my $quote_beginchar += length($`); # 1. do a wordcount of this quote my $wordcount = 0; my $wc_quote = $quote; while ($wc_quote =~ /($word_re)/) { $wordcount++; $wc_quote =~ s/$word_re//; } # 2. is there a slash in the quote? my $contains_slash = 0; ($contains_slash = 1) if ($quote =~ /\//); # 3. add this quote to the quote table #dbquery($dbh, qq/DELETE FROM quote # WHERE beginchar = $quote_beginchar # AND paragraph_id = $paragraph_id/); dbquery($dbh, qq/INSERT INTO quote (beginchar, wordcount, contains_slash, paragraph_id) VALUES ($quote_beginchar, $wordcount, $contains_slash, $paragraph_id)/); # delete this quote from the text if ($quote =~ /\."?\s?$/) { $text_noquotes =~ s/$quote_re/ ""\. /; } else { $text_noquotes =~ s/$quote_re/ "" /; } } ### find words for word_paragraph ### # for each quoteless paragraph, add entries to word_paragraph # (word_id, paragraph_id, position) # and to word (id, word) my $text_para = $text_noquotes." "; my $text_position = 1; # delete tabs...! $text_para =~ s/\t//g; # split paragraphs at spaces while ($text_para =~ /(.*?)\s/) { my $word = $1; # throw out punctuation but leave apostrophes and dashes intact $word =~ s/[^\w'-]//g; $word =~ s/_//g; $word =~ s/-{2,}//g; #$word =~ s/^.*?([\w']+).*?$/$1/; # don't do anything if we don't have a word left unless ($word) { $text_para =~ s/(.*?)\s//; next; } # put this word in word table my $word_id; $word_id = dbquery($dbh, qq/SELECT id FROM word WHERE word like "$word" LIMIT 1/); unless ($word_id) { #print "not in database: $word\n"; # the word isn't in the database, so put it in and get the id dbquery($dbh, qq/INSERT INTO word (word) VALUES ("$word")/); $word_id = dbquery($dbh, qq/SELECT last_insert_id() FROM word WHERE id = 1/); } else { #print "already there: $word\n"; } # put this word in word_paragraph table # (first delete anything that would conflict) #dbquery($dbh, qq/DELETE FROM word_paragraph WHERE paragraph_id = $paragraph_id AND position = $text_position/); dbquery($dbh, qq/INSERT INTO word_paragraph (word_id, paragraph_id, position) VALUES ($word_id, $paragraph_id, $text_position)/); # delete this word from the paragraph and move on $text_para =~ s/(.*?)\s//; $text_position++; } } dbquery($dbh, qq/UNLOCK TABLES/); # split each paragraph into sentences: # 0. test out the perl module Lingua::EN::Sentence.pm and see if # it works well enough # 1. if we see (lcase?)letter + period/quest/excl + space + capital/"/(/'/[ # /[A-Za-z][.!?] [A-Z"('\[]/ # 2. NO on common titles (Mr. Ms. Mrs.)... / Mr. [A-Z]/ # 3. NO on abbreviated names (M. Gira)... / [A-Z]\. [A-Z]/ # split each sentence into words: # 1. how do we deal with the majority of hyphenated words? would it be # more statistically useful to break them into separate words? how does # this change semantics? # 2. strip periods, exclamation points, question marks # 3. what do we do with band names? # 4. what do we do with quotes? # don't count quotes as words or as sentences... # delete them when counting those. # lyrics: stuff in quotes, but definitely long stuff that includes forward slashes. # /".*"/ and definitely /".*\/.*"/ # song titles: smaller stuff, capital letters on words