## Factiva.postfilter.pl ## ## This Perl program is used for post-filtering of stories downloaded via ## email from the Factiva news site. It is used to *remove* stories based ## on sets of words and phrases found in the article. The output is a ## slightly reformatted version of the original file. ## ## INPUT FILES ## ## $file_list = "files.list"; ## List of the Factiva news file that are to be processed. These files are ## the HTML documents produced by the "email" option on the Factiva site. ## ## $discard_list = "discard.words"; ## This is the list of words and phrases used to determine whether a story ## should be discarded. Each word or phrase is on a separate line; the file ## has and sections. ## ## Example: ## ## ## Amin ## war on terror ## nuclear weapons ## accident ## Palestin\w* ## ## Gates ## accident ## malaria ## al-Qaeda ## ## Any line that *begins* with '#' is treated as a comment and skipped. ## ## Words and phrases in the section are checked in the headline and first ## paragraph; words and phrases in the section are checked in the remainder ## of the story. If you want a phrase to be checked anywhere in the story, include ## it in both sections. ## ## The phrases in this file are actually Perl regular expressions, so the following ## characters must be "escaped" with a backslash \ ## ## + / . * ^ $ ( ) [ { | \ ## ## So, for example, if you want to discard on the phrase ## ## Congo (Brazzaville) ## ## this must be written as ## ## Congo \(Brazzaville\) ## ## In exchange for this slight inconvenience, the full power of perl regular ## expressions is available, so for example ## ## Palestin\w* ## ## means "Palestin" followed by any combination of letters followed by a blank ## (the blank is probably not visible on your screen...), so this would match ## ## Palestine ## Palestinian ## Palestinians ## ## Unless you are already familiar with regular expressions, it is not worth the ## trouble learning for this project; just use regular words. ## ## Words and phrases are case sensitive (e.g. Amin and amin are different) and if ## you only want an exact word to match, be sure to end it (and, if you wish, begin ## it) with a blank; otherwise you might get some unexpected matches. For example, ## amin (as in former Ugandan dictator Idi Amin) matches ## ## famine ## roaming ## examine ## examination ## ## If you capitalized Amin, you wouldn't get these matches, but the general principle ## still holds. ## ## Remember, BLANK SPACE IS SIGNIFICANT and is part of the pattern. This leads to two ## cautions based on our experience thus far with this program: ## ## 1. Be sure to put only a single leading blank in front of a word (unless you ## deliberately mean to have more than one, e.g. at the beginning of a sentence). ## ## 2. If a line containing a single blank is in the file, it will match every story, ## and you will see nothing in the .filt output files. ## ## If you are using BBEdit, you might want to turn on the "Show Invisibles" and "Show ## Spaces" options to double-check your files. ## ## ## OUTPUT FILES ## ## The stories are written to a file that has the same name as the input file except ## that ".html" is changed to ".filt". A new header indicates that the file has ## been processed by the program, and shows the phrases used to determine discards. ## Otherwise the file generally has the same format as the original except that the ## header (red), lead paragraph (green) and body (blue) have been color-coded. ## ## SYSTEM REQUIREMENTS ## This program has been successfully run on Macintosh (MacPerl 5.6.1r2; Mac OS 9) system; ## there are no Mac-specific features and the program should run without modifications ## on other operating systems. ## ## Programmer: Philip A. Schrodt (schrodt@ku.edu) ## ## Modifications: ## Initial version: 21-May-04 ## #!/usr/local/bin/perl # ======== globals =========== # $file_list = "filter.files"; $discard_list = "discard.words"; $output_suffix = "filt"; # ======== main program =========== # # read discard list $kb = 0; open(FDIR,$discard_list) or die "Can\'t open list of discard words $discard_list; error $!"; $line = ; # discard first line while ($line = ) { # read head/lead discards chomp($line); if ($line =~ m/^#/) {next;} if ($line =~ m//) {last;} $discardL[$kb] = $line; # discards in headline and lead ++$kb; } $nwordsL = $kb; $kb = 0; while ($line = ) { # read discards chomp($line); if ($line =~ m/^#/) {next;} $discardB[$kb] = $line; ++$kb; } $nwordsB = $kb; close(FDIR); open(FDIR,$file_list) or die "Can\'t open list of input files $file_list; error $!"; while ($file = ) { # file loop chomp($file); print "\nProcessing $file\n"; open(FIN,"$file") or die "Can\'t open input file $file; error $!"; $outfile = $file; $outfile =~ s/\.html/\.$output_suffix/; open(FOUT,">$outfile") or die "Can\'t open output file $outfile; error $!"; $dots = 0; ### main record processing loop ### # copy file header while ($line = ) { print FOUT $line; if ($line =~ m/Next<\/font>/) {last;} } while ($line = ) { print FOUT $line; if ($line =~ m/<\/table>/) {last;} } # insert discardL list print FOUT "

--- File filtered by PITF_FILTER.PL ---<\/H2>\n

Headline and lead paragraph discard list:<\/H3>\n"; for ($kb = 0; $kb < $nwordsL; ++$kb) { print FOUT $discardL[$kb],"
\n"; } print FOUT "
\n

Body text discard list:<\/H3>\n"; for ($kb = 0; $kb < $nwordsB; ++$kb) { print FOUT $discardB[$kb],"
\n"; } print FOUT "
\n"; while ($line = ) { #get story header $header = ""; $line = ""; do { $line =~ s/)) {last}; } until ($line =~ m/

/); # get first paragraph $first = $line; do { $first .= $line; if (!($line = )) {last}; } until ($line =~ m/<\/P>/); $first .= $line; # get remainder of story $rest = ""; $line = ""; do { $line =~ s/)) {last}; } until ($line =~ m/<\/table>/); $rest .= $line; # get tail of story $tail = ""; $line = ""; while ($line = ) { $tail.= $line; if ($line =~ m/<\/table>/) {last}; } # check for discards $isokL = 1; $isokB = 1; for ($kb = 0; $kb < $nwordsL; ++$kb) { if ($header =~m/$discardL[$kb]/) { $isokL = 0; last}; if ($first =~m/$discardL[$kb]/) { $isokL = 0; last}; } if ($isokL) { for ($kb = 0; $kb < $nwordsB; ++$kb) { if ($rest =~m/$discardB[$kb]/) { $isokB = 0; last}; } } if ($isokL * $isokB) { print FOUT $header; print FOUT "\n",$first,"\n"; print FOUT "\n",$rest,"\n"; print FOUT $tail; } else { if ($isokB) { print "Discard lead: ",$discardL[$kb],"\n";} else { print "Discard body: ",$discardB[$kb],"\n";} } print '.'; # show signs of life... if (++$dots > 40) { print "\n"; $dots = 0;} } print FOUT "\n<\/body>\n<\/html>\n"; close(FIN); close(FOUT); } # end file loop close(FDIR); print "\nProgram has finished!";