## Factiva.Reutlead.filter.pl
##
## This Perl program is used for converting the lead sentences of Reuters stories downloaded
## via email from the Factiva news site to the TABARI input format.
##
## INPUT FILES
##
## $file_list = "filter.files";
## Output and dates file names, followed by a list of the Factiva news files that are
## to be processed. These should be in chronological order
##
## Example:
##
##   Liberia.Reuters.leads
##   Liberia.Reuters.dates
##   pitf.liberia.19891208-19900407.html
##   pitf.liberia.19900407-19900603.html
##   pitf.liberia.19900604-19900629.html
##   pitf.liberia.19900629-19900723.html
##   pitf.liberia.19900723-19900803.html
##   pitf.liberia.19900803-19900814.html
##
## This is easily generated by using "ls > filter.files" and then editing to add the first
## two file names
##
## OUTPUT FILES
##
## FIRST FILE NAME: ("Liberia.Reuters.leads" in example)
##
## Lead sentences -- which in the Factiva/Reuters format is the first text delineated
## by <P>...</P> following the dateline -- in TABARI format; sentence is broken
## by hard-returns into lines of less than 80 characters. The header is of the
## form
##   YYMMDD AFPF-ssss-01 AFPR0000...
## where
##   YYMMDD   Date -- this is extracted from the story date
##   ssss     Sequence of the story within the date, starting with 0001
##   lba ...  Factiva serial number for the story
##
## SECOND FILE NAME: ("Liberia.Reuters.dates" in example)
##
## This shows the dates that were processed in YYMMDD order and the number of stories
## found for each date; this should be checked to ensure that the files were actually
## in chronological order and no time intervals are missing or duplicated.
##
## The alert -SKIP- will occur whenever the last digits of the new date are
## not equal to those of the previous date plus 1 and the new day is not 01; this
## can occur either because of sparse data or a file sequencing error.
##
## FILTERING
##
## In comparison to NexisFilter.pl, this program does relatively little filtering.
## At present, the following filters are active
##
## 1. Story is skipped if the " - " delimiter for the dateline is missing; somehow
##    Reuters occasionally manages to mess this up
##
## 2. Stories are skipped if the text following the dateline is less than 80
##    characters or more than 320 -- this gets rid of an assortment of junk
##    (e.g. financial reports) that aren't standard stories
##
## 3. /\.</ is recognized as an alternative sentence terminator
##
## 4. There are stubs for "Special filters" that can be used to get rid of country-
##    specific stories such as commodity reports
##
## PROGRAMMING NOTES:
##
## 1. In order to avoid accidentally over-writing data files when one forgets to
##    include the ".leads" and ".dates" entries, the program exits with an error
##    if it encounters .html in either of these names. This isn't fool-proof
##    but will catch the most common error.
##
## SYSTEM REQUIREMENTS
## This program has been successfully run on perl under the Mac OS-10.3 system and a
## Compaq AlphaServer ES40 running Digital UNIX.
##
## Programmer: Philip A. Schrodt (schrodt@ku.edu)
##
## Modifications:
## 05-Jul-04: Initial version for AFP (FAST project)
## 15-Mar-05: Revised to handle Reuters for PITF
## 27-Jan-07: Revised to handle format shift to Windows line feeds
##
## NOTE(review): the interpreter line below is not the first line of the file, so
## it is read as an ordinary comment rather than as a #! directive.
#!/usr/local/bin/perl

# ======== globals =========== #

# Name of the control file: output file name, dates file name, then the input files.
$file_list = "filter.files";

# NOTE(review): %months is not referenced in the portion of the script visible here.
%months = ("Jan","01","Feb","02","Mar","03","Apr","04","May","05","Jun","06","Jul","07","Aug","08","Sep","09","Oct","10","Nov","11","Dec","12",); # hash for converting alpha months to numeric

# ======== main program =========== #

# Open the control file; abort with the system error text if it cannot be opened.
open(FDIR,$file_list) or die "Can\'t open list of input files $file_list; error $!";

#read output file and date file names, open same
# NOTE(review): the text between "$file =" and "//;" on the next statement appears
# to have been lost (presumably everything from a <FDIR> read up to an s/<P>//
# substitution was stripped as if it were HTML markup). The code that opens
# FOUT, FDATE and FIN, the file loop, the story loop and the filter "if" are
# therefore not visible, and the closing braces further down have no visible
# openers. Recover the original before running or modifying this script.
$file =//; # get rid of the HTML markers

      # ---- surviving tail of the per-story "accept" branch ----
      # $first holds the lead text of the current story.
      $first =~ s/<\/P>//;
      # NOTE(review): as written this replaces a double quote with itself; the
      # pattern was presumably an HTML entity such as &quot; -- TODO confirm.
      $first =~ s/"/"/g; # convert "
      $first =~ s/\r//g; # remove internal Windows line feeds
      $first =~ s/\n/ /g; # replace internal Unix line feeds with blanks
      # NOTE(review): every \n was replaced by a blank on the previous statement,
      # so this pattern cannot match at this point.
      $first =~ s/\.\s+\n/\.\n/; # get rid of trailing white space

      # Output starts 3 characters past the first " - ", i.e. just after the dateline delimiter.
      $start = index($first," - ") + 3; # skip dateline
      $end = 0;

      # Walk every blank in $first. pos($first) is the offset just past the blank
      # that was matched; $end remembers the offset of the most recent blank that
      # still fit on the current output line.
      while ($first =~ m/ /g) { # break the sentence at word endings
        # Current blank is 79 or more characters past $start: emit the text from
        # $start through the remembered blank, then start the next line after it.
        if ((pos($first) - $start)>= 79) {
          print FOUT substr($first, $start, $end - $start + 1),"\n";
          $start = $end + 1;
        }
        # Otherwise this blank still fits; remember its offset as a break candidate.
        else {
          $end = pos ($first) - 1;
        }
      }
      # Requested length runs one past the end of the string; substr returns
      # whatever remains. The extra "\n" leaves a blank line after the record.
      print FOUT substr($first, $start, length($first) - $start + 1),"\n\n"; # output the remainder

      # Progress indicator on standard output: one dot per story written, with a
      # line break once the dot counter exceeds 40.
      print '.'; # show signs of life...
      if (++$dots > 40) { print "\n"; $dots = 0;}
    }
    # The "if" this "else" belongs to is in the missing section (see note above).
    else {print "-";} # show that something was wrong
  }
  close(FIN);
} # end file loop

# NOTE(review): $kc is maintained in the missing section; presumably it is the
# story count for the last date processed -- verify against the original.
print FDATE "\t",$kc,"\n"; # print final event count
close(FDATE);
close(FOUT);
print "\nProgram has finished!";