#!/links/bin/perl -w # # part of the pitchfork reviews project: # builds sentence table and word_sentence joiner table # # loren jan wilson, november 2003 # use strict; use DBI; use Reviews; use Lingua::EN::Sentence; # pull paragraphs out one by one. for each paragraph, # 1. parse sentences out of the paragraph table # 1.2 find length of each sentence and calculate beginchar # 1.3 insert sentence into sentence table (id, beginchar, length, ->paragraph) # 2. remove quotes from sentence # 3. parse words from each sentence # 4. make word_sentence table my $dbh = make_db_handle; my $paragraphs = dbquery($dbh, qq/SELECT * FROM paragraph ORDER BY id/); # first drop all current entries in these tables. # there are so many interdependencies that the old data is useless. dbquery($dbh, qq/DELETE FROM sentence/); dbquery($dbh, qq/DELETE FROM word_sentence/); # lock tables. dbquery($dbh, qq/LOCK TABLES sentence WRITE, word_sentence WRITE, word READ/); my $paragraph_count; foreach my $paragraph_array (@$paragraphs) { my $paragraph_id = $$paragraph_array[0]; my $position = $$paragraph_array[1]; my $text = $$paragraph_array[2]; my $article_id = $$paragraph_array[3]; my $text_noquotes = $text; $paragraph_count++; if ($paragraph_count >= 5) { dbquery($dbh, qq/ UNLOCK TABLES /); dbquery($dbh, qq/ LOCK TABLES sentence WRITE, word_sentence WRITE, word READ /); $paragraph_count = 0; } ### 1. parse sentences ### my $sentences = Lingua::EN::Sentence::get_sentences($text); my $beginchar = 1; foreach my $sentence (@$sentences) { # 1.2 find length of each sentence my $length = length($sentence); # 1.3 insert sentence into sentence table dbquery($dbh, qq/ INSERT INTO sentence(beginchar, length, paragraph_id) VALUES($beginchar, $length, $paragraph_id) /); my $sentence_id = dbquery($dbh, qq/ SELECT last_insert_id() FROM sentence WHERE id = 1 LIMIT 1 /); # move the beginchar marker forward # the extra 1 is a space at the end of each sentence... # hopefully it's not ever more. $beginchar += $length + 1; ### 2. delete quotes ### my $word_re = qr/\w+'*\w*/; my $quote_re = qr/\s # one space .? # maybe one other character " # a quote mark \w.*? # word character followed by anything [\w,\.?!\)'] # word character or , or . or ? or ! or ) or ' " # ending quote mark \.? #maybe a period \)? # maybe a question mark too \s? # ending space if it's there /x; while ($sentence =~ /($quote_re)/) { my $quote = $1; # delete this quote from the sentence if ($quote =~ /\."?\s?$/) { $sentence =~ s/$quote_re/ ""\. /; } else { $sentence =~ s/$quote_re/ "" /; } } ### find words for word_sentence ### # for each quoteless sentence, add entries to word_sentence # (word_id, sentence_id, position, part_of_speech) my $sentence = $sentence." "; my $text_position = 1; # split sentence at spaces while ($sentence =~ /(.*?)\s/) { my $word = $1; # throw out punctuation; leave apostrophes and dashes $word =~ s/[^\w'-]//g; $word =~ s/_//g; $word =~ s/-{2,}//g; # don't do anything if we don't have a word left unless ($word) { $sentence =~ s/(.*?)\s//; next; } # find the word_id in word table my $word_id = dbquery($dbh, qq/SELECT id FROM word WHERE word like "$word" LIMIT 1/); # if the word isn't in the database, just move on. unless ($word_id) { $sentence =~ s/(.*?)\s//; next; } # put this word in word_sentence table dbquery($dbh, qq/INSERT INTO word_sentence (word_id, sentence_id, position) VALUES ($word_id, $sentence_id, $text_position)/); # delete this word from the sentence and move on $sentence =~ s/(.*?)\s//; $text_position++; } } } dbquery($dbh, qq/UNLOCK TABLES/);