#!/usr/local/bin/perl
##
##  Factiva.Reutlead.filter.pl
##
##  This Perl program is used for converting the lead sentences of Reuters stories
##  downloaded via email from the Factiva news site to the TABARI input format.
##
##  INPUT FILES
##
##  $file_list = "filter.files";
##    Output and dates file names, followed by a list of the Factiva news files that
##    are to be processed.  These should be in chronological order.
##
##    Example:
##
##      Liberia.Reuters.leads
##      Liberia.Reuters.dates
##      pitf.liberia.19891208-19900407.html
##      pitf.liberia.19900407-19900603.html
##      pitf.liberia.19900604-19900629.html
##      pitf.liberia.19900629-19900723.html
##      pitf.liberia.19900723-19900803.html
##      pitf.liberia.19900803-19900814.html
##
##    This is easily generated by using "ls > filter.files" and then editing to add
##    the first two file names.
##
##  OUTPUT FILES
##
##  FIRST FILE NAME: ("Liberia.Reuters.leads" in example)
##
##    Lead sentences -- which in the Factiva/Reuters format is the first text
##    delineated by <P>...</P> following the dateline -- in TABARI format; the
##    sentence is broken by hard-returns into lines of less than 80 characters.
##    The header is of the form
##        YYMMDD REUT-ssss-01 lba0000...
##    where
##        YYMMDD    Date -- this is extracted from the story date
##        ssss      Sequence of the story within the date, starting with 0001
##        lba ...   Factiva serial number for the story
##
##  SECOND FILE NAME: ("Liberia.Reuters.dates" in example)
##
##    This shows the dates that were processed in YYMMDD order and the number of
##    stories found for each date; this should be checked to ensure that the files
##    were actually in chronological order and no time intervals are missing or
##    duplicated.
##
##    The alert -SKIP- will occur whenever the last digits of the new date are
##    not equal to those of the previous date plus 1 and the new day is not 01; this
##    can occur either because of sparse data or a file sequencing error.
##
##  FILTERING
##
##  In comparison to NexisFilter.pl, this program does relatively little filtering.
##  At present, the following filters are active
##
##  1. Story is skipped if the " - " delimiter for the dateline is missing; somehow
##     Reuters occasionally manages to mess this up
##
##  2. Stories are skipped if the text following the dateline is less than 80
##     characters or more than 400 -- this gets rid of an assortment of junk
##     (e.g. financial reports) that aren't standard stories
##
##  3. /\.</ is recognized as an alternative sentence terminator
##
##  4. There are stubs for "Special filters" that can be used to get rid of country-
##     specific stories such as commodity reports
##
##  5. Stories whose date line cannot be parsed are skipped; a "-" is printed on
##     the console for each one
##
##  PROGRAMMING NOTES:
##
##  1. In order to avoid accidentally over-writing data files when one forgets to
##     include the ".leads" and ".dates" entries, the program exits with an error
##     if it encounters .html in either of these names.  This isn't fool-proof
##     but will catch the most common error.
##
##  SYSTEM REQUIREMENTS
##    This program has been successfully run on perl under the Mac OS-10.3 system
##    and a Compaq AlphaServer ES40 running Digital UNIX.
##
##  Programmer: Philip A. Schrodt (schrodt@ku.edu)
##
##  Modifications:
##    05-Jul-04: Initial version for AFP (FAST project)
##    15-Mar-05: Revised to handle Reuters for PITF
##    27-Jan-07: Revised to handle format shift to Windows line feeds
##

use strict;
use warnings;

# ======== globals =========== #

my $file_list = "filter.files";

# hash for converting alpha months to numeric
my %months = (
    "Jan", "01", "Feb", "02", "Mar", "03", "Apr", "04",
    "May", "05", "Jun", "06", "Jul", "07", "Aug", "08",
    "Sep", "09", "Oct", "10", "Nov", "11", "Dec", "12",
);

# ======== helpers =========== #

# Read the next entry from the file list.  Returns the entry with its line
# terminator (Unix or Windows) and surrounding white space removed, or undef
# once the list is exhausted.
sub read_list_entry {
    my ($fh) = @_;
    my $entry = <$fh>;
    return unless defined $entry;
    $entry =~ s/\A\s+//;
    $entry =~ s/\s+\z//;
    return $entry;
}

# Convert a Factiva date line to YYMMDD.  The day is the first two digits on
# the line, the month is the three letters starting one character after the
# day, and the year is the two digits immediately before the line break.
# Returns undef when any of the three parts is missing or malformed.
#
# NOTE(review): assumes the date line ends with an HTML line break (<BR>, any
# case) directly after the year -- confirm against a sample Factiva file.
sub story_date {
    my ($dateline) = @_;
    return unless defined $dateline;

    my ($day, $month_name) = $dateline =~ m/\A\D*(\d\d).([A-Za-z]{3})/s;
    return unless defined $day;
    my $monthno = $months{$month_name};
    return unless defined $monthno;

    my ($yearno) = $dateline =~ m/(\d\d)<br/i;
    return unless defined $yearno;

    return $yearno . $monthno . $day;
}

# ======== main program =========== #

open(my $fdir, '<', $file_list)
    or die "Can't open list of input files $file_list; error $!";

# read output file and date file names, open same
my $leads_name = read_list_entry($fdir);
if (!defined $leads_name || $leads_name eq "") {
    die "List of input files $file_list does not name a leads output file";
}
if ($leads_name =~ m/\.html/) {    # see note in intro
    die "Leads output file name $leads_name contains '.html'; this is not allowed";
}
open(my $fout, '>', $leads_name)
    or die "Can't open output file $leads_name; error $!";

my $dates_name = read_list_entry($fdir);
if (!defined $dates_name || $dates_name eq "") {
    die "List of input files $file_list does not name a dates output file";
}
if ($dates_name =~ m/\.html/) {    # see note in intro
    die "Dates output file name $dates_name contains '.html'; this is not allowed";
}
open(my $fdate, '>', $dates_name)
    or die "Can't open dates output file $dates_name; error $!";

my $lastdate = "";
my $kc       = 1;    # count of events by date

FILE: while (defined(my $file = read_list_entry($fdir))) {    # file loop
    next FILE if $file eq "";    # tolerate blank lines in the list

    print "\nProcessing $file\n";
    open(my $fin, '<', $file) or die "Can't open input file $file; error $!";

    # NOTE(review): the previous date line in the dates file is still waiting
    # for its count at this point, so the file name is appended to that line.
    print {$fdate} $file, "\n";
    my $dots = 0;

    ### main record processing loop ###

    # skip file header
    while (defined(my $header = <$fin>)) {
        last if $header =~ m/<\/table>/;
    }

    # Any end-of-file inside a story abandons that story and ends the file.
    STORY: while (defined(my $line = <$fin>)) {

        # get serial number from story header
        while (defined($line = <$fin>)) {
            last if $line =~ m/lba0000/i;    # identifying string for serial number (case changes in 2003)
        }
        last STORY unless defined $line;
        my $serialine = $line;               # save line containing serial number

        # the date is on the line following the word count
        $line = <$fin>;
        last STORY unless defined $line;
        until ($line =~ m/ Words/) {
            $line = <$fin>;
            last STORY unless defined $line;
        }
        my $dateline = <$fin>;
        last STORY unless defined $dateline;

        # find lead
        until ($line =~ m/\(Reuters\) / || $line =~ m/Reuters /) {    # handle format shift
            $line = <$fin>;
            last STORY unless defined $line;
        }

        # get first paragraph
        my $first = $line;
        while (1) {
            $line = <$fin>;
            last STORY unless defined $line;
            $first .= $line;
            last if $line =~ m/<\/P>/;
        }

        # skip remainder of story: this could be modified later for full-story extraction
        while (1) {
            $line = <$fin>;
            last STORY unless defined $line;
            last if $line =~ m/<\/table>/;
        }

        # skip tail of story; running out of input here still keeps the story
        while (defined($line = <$fin>)) {
            last if $line =~ m/<\/table>/;
        }

        my $dash = index($first, " - ");
        next STORY if $dash < 0;          ## skip stories without a properly formatted dateline
        my $lead_length = length($first) - $dash;
        next STORY if $lead_length < 80;  ## skip very short stories
        next STORY if $lead_length > 400; ## skip very long stories
#       next STORY if $first =~ m/oilseed exports/;  ## special filter: Algeria
#       next STORY if $first =~ m/USDA said/;        ## special filter: Algeria
#       next STORY if $first =~ m/green robusta/;    ## special filter

        # reformat date
        my $newdate = story_date($dateline);
        if (!defined $newdate) {
            print "-";                    # show that something was wrong
            next STORY;
        }
        my ($serial) = $serialine =~ m/(lba\w+)/i;    # extract the serial number

        if ($lastdate ne $newdate) {
            if ($lastdate ne "") {
                # output event count from previous day and new date
                print {$fdate} "\t", $kc, "\n";
                my $checkdate = sprintf("%06d", $lastdate + 1);    # keep leading zeros (Y2K)
                if (($checkdate ne $newdate) && (substr($newdate, 4, 2) ne "01")) {
                    print {$fdate} "-SKIP-\n";
                }
            }
            print {$fdate} $newdate;      # the count is appended when the date changes
            print "\n", $newdate;
            $dots     = 0;
            $lastdate = $newdate;
            $kc       = 1;
        }
        else {
            ++$kc;
        }
        my $seqno = sprintf("%04d", $kc);    # sequence number within the date
        print {$fout} $newdate, " REUT-$seqno-01 $serial\n";    # print record header

        ## write the records with maximum 80-character line lengths
        if ($first =~ m/\.</) {           # get rid of these strange things...
            $first = substr($first, 0, index($first, ".<") + 1);
            $first .= " \n";
        }
        $first =~ s/<P>//;                # get rid of the HTML markers
        $first =~ s/<\/P>//;
        $first =~ s/&quot;/"/g;           # convert &quot;
        $first =~ s/\r//g;                # remove internal Windows line feeds
        $first =~ s/\n/ /g;               # replace internal Unix line feeds with blanks

        # The blank left at the end of the text is deliberate: it gives the
        # loop below a break point after the final word.
        my $start = index($first, " - ") + 3;    # skip dateline
        my $end   = -1;                          # last blank seen so far
        while ($first =~ m/ /g) {                # break the sentence at word endings
            my $blank = pos($first) - 1;
            # only break when there is a blank inside the current line to break at
            if ((($blank + 1 - $start) >= 79) && ($end >= $start)) {
                print {$fout} substr($first, $start, $end - $start + 1), "\n";
                $start = $end + 1;
            }
            $end = $blank;
        }
        print {$fout} substr($first, $start), "\n\n";    # output the remainder

        print '.';                        # show signs of life...
        if (++$dots > 40) { print "\n"; $dots = 0; }
    }
    close($fin);
}    # end file loop
close($fdir);

if ($lastdate ne "") {
    print {$fdate} "\t", $kc, "\n";       # print final event count
}
close($fdate) or die "Can't close dates output file $dates_name; error $!";
close($fout)  or die "Can't close output file $leads_name; error $!";
print "\nProgram has finished!";