#!/usr/local/bin/perl
##  Factiva.filter.pl
##
##  This Perl program is used for converting the lead sentences of AFP stories
##  downloaded via email from the Factiva news site to the TABARI input format.
##
##  INPUT FILES
##
##  $file_list = "filter.files";
##    Output and dates file names, followed by a list of the Factiva news files
##    that are to be processed. These should be in chronological order.
##
##  $discard_list = "discard.words";
##    This is the list of words and phrases used to determine whether a story
##    should be discarded. Each word or phrase is on a separate line; the file
##    is divided into tagged sections (the tag names were lost from this copy
##    of the header).
##    NOTE: this version of the program never opens this file -- see
##    programming note 2 below.
##
##  Example of the file list:
##
##    AFPL.AMJ04.leads
##    AFPL.AMJ04.dates
##    F2.1
##    F2.2
##    F2.3
##    F2.4
##    F2.5
##
##  OUTPUT FILES
##
##  FIRST FILE NAME: ("AFPL.AMJ04.leads" in example)
##
##    Lead sentences -- which in the Factiva/AFP format is the first text
##    delineated by <P>...</P> following the dateline -- in TABARI format;
##    sentence is broken by hard-returns into lines of less than 80 characters.
##    The header is of the form
##        YYMMDD AFPF-ssss-01 AFPR0000...
##    where
##        YYMMDD   Date -- this is extracted from the Factiva serial number
##        ssss     Sequence of the story within the date, starting with 0001
##        AFPR...  Factiva serial number for the story
##
##  SECOND FILE NAME: ("AFPL.AMJ04.dates" in example)
##
##    This shows the dates that were processed in YYMMDD order and the number
##    of stories found for each date; this should be checked to ensure that the
##    files were actually in chronological order and no time intervals are
##    missing or duplicated.
##
##    The alert ---> CHECK DATE will occur whenever the last digits of the new
##    date are not equal to those of the previous date plus 1; this will occur
##    correctly when there is a change of month but probably is due to a
##    sequencing error otherwise. The very first date processed has no
##    predecessor and is therefore never flagged.
##
##  PROGRAMMING NOTES:
##
##  1. This is still a bit of a work-in-progress that was created quickly to
##     deal with the fact that AFP disappeared without notice from NEXIS and we
##     suddenly had to get it from Factiva.
##
##  2. Unlike its predecessor Nexisfilter.pl, this program does no internal
##     filtering of the stories. However, Factiva does a *much* better job of
##     this prior to the download than was done by NEXIS, so it appears less
##     necessary.
##
##  SYSTEM REQUIREMENTS
##    This program has been successfully run on a Macintosh (MacPerl 5.6.1r2;
##    Mac OS 9) system; there are no Mac-specific features and the program
##    should run without modifications on other operating systems.
##
##  Programmer: Philip A. Schrodt (schrodt@ku.edu)
##
##  Modifications:
##    Initial version: 05-Jul-04
##

use strict;
use warnings;

# ======== globals =========== #

my $file_list = "filter.files";

# ======== main program =========== #

open(my $fdir, '<', $file_list)
    or die "Can\'t open list of input files $file_list; error $!";

# read output file and date file names, open same
my $leads_name = <$fdir>;
defined $leads_name
    or die "No output file name found in $file_list";
chomp($leads_name);
open(my $fout, '>', $leads_name)
    or die "Can\'t open output file $leads_name; error $!";

my $dates_name = <$fdir>;
defined $dates_name
    or die "No dates output file name found in $file_list";
chomp($dates_name);
open(my $fdate, '>', $dates_name)
    or die "Can\'t open dates output file $dates_name; error $!";

my $lastdate = "";      # date (YYMMDD) of the previous story; "" before the first
my $kc       = 1;       # count of events by date
my $seqno    = "0001";  # sequence number within the date (string, so ++ keeps zeros)

while (defined(my $file = <$fdir>)) {    # file loop
    chomp($file);
    next if $file =~ m/^\s*$/;           # tolerate blank lines in the list

    print "\nProcessing $file\n";
    open(my $fin, '<', $file)
        or die "Can\'t open input file $file; error $!";
    my $dots = 0;

    # skip file header: everything through the "Next" link, then through the
    # end of the table that follows it
    while (defined(my $skip = <$fin>)) {
        last if $skip =~ m/Next<\/font>/;
    }
    while (defined(my $skip = <$fin>)) {
        last if $skip =~ m/<\/table>/;
    }

    ### main record processing loop ###
    # One line is consumed at the top of every record, as in the original
    # program. Hitting end-of-file in the middle of a record abandons that
    # record and ends processing of the file ("last RECORD"); this is what the
    # original "last" inside a do-until block did at run time, because a
    # do-block is not a loop and "last" left the enclosing while loop.
  RECORD:
    while (defined(my $line = <$fin>)) {

        # get serial number from story header: read up to the line holding <P>
        my $serialine = "";
        while (1) {
            $line = <$fin>;
            last RECORD unless defined $line;
            if ($line =~ m/AFPR0000/) { $serialine = $line; }  # line containing serial number
            last if $line =~ m/<P>/;
        }

        # get first paragraph: every line after the <P> line, through </P>
        my $first = "";
        while (1) {
            $line = <$fin>;
            last RECORD unless defined $line;
            $first .= $line;
            last if $line =~ m/<\/P>/;
        }

        # skip remainder of story: this could be modified later for
        # full-story extraction.
        # NOTE(review): the original applied a substitution to each line here
        # and accumulated the text, but that text was never used and the
        # substitution was lost from this copy of the source -- confirm
        # against an intact copy before adding full-story extraction.
        while (1) {
            $line = <$fin>;
            last RECORD unless defined $line;
            last if $line =~ m/<\/table>/;
        }

        # skip tail of story; end-of-file here still lets the record be written
        while (defined($line = <$fin>)) {
            last if $line =~ m/<\/table>/;
        }

        # check that we found a legitimate serial number
        if ($serialine =~ m/(AFPR\w+)/) {
            my $serial = $1;
            # assumes the YYMMDD date sits at offset 10 of the serial number
            # (i.e. after "AFPR0000" and a two-digit century)
            my $newdate = substr($serial, 10, 6);

            if ($lastdate ne $newdate) {
                if ($lastdate ne "") {
                    # output event count from previous day and new date
                    print {$fdate} "\t", $kc, "\n", $newdate;

                    # string increment keeps leading zeros ("040401" -> "040402")
                    my $checkdate = $lastdate;
                    ++$checkdate;
                    if ($checkdate ne $newdate) {
                        print {$fdate} "---> CHECK DATE\n";
                    }
                }
                else {
                    # first date of the run: nothing to compare against
                    print {$fdate} $newdate;
                }
                print "\n", $newdate;
                $dots     = 0;
                $lastdate = $newdate;
                $kc       = 1;
                $seqno    = "0001";
            }
            else {
                ++$kc;
                ++$seqno;
            }

            # print record header; no newline is written here
            # NOTE(review): presumably the paragraph text starts with a line
            # break in the Factiva files -- confirm against real input
            print {$fout} $newdate, " AFPF-$seqno-01 $serial";

            ## write the records with maximum 80-character line lengths
            $first =~ s/<P>//;              # get rid of the HTML markers
            $first =~ s/<\/P>//;
            $first =~ s/\.\s+\n/\.\n/;      # get rid of trailing white space
            for my $piece (wrap_lead($first)) {
                print {$fout} $piece, "\n";
            }

            print '.';                      # show signs of life...
            if (++$dots > 40) { print "\n"; $dots = 0; }
        }
        else {
            print "-";                      # show that something was wrong
        }
    }
    close($fin);
}    # end file loop
close($fdir);

print {$fdate} "\t", $kc, "\n";    # print final event count
close($fdate) or die "Can\'t close dates output file $dates_name; error $!";
close($fout)  or die "Can\'t close output file $leads_name; error $!";
print "\nProgram has finished!";

# ======== helpers =========== #

# wrap_lead($text)
#   Splits $text into pieces at blanks so that a piece is cut whenever the
#   next blank lies 79 or more characters past the start of the piece.
#   Each cut piece ends with the blank it was broken at; the final piece is
#   whatever remains. Line breaks already inside $text are not treated
#   specially. Returns the list of pieces in order.
sub wrap_lead {
    my ($text) = @_;
    my @pieces;
    my $start = 0;    # offset where the current piece begins
    my $end   = 0;    # offset of the last blank that still fits

    while ($text =~ m/ /g) {    # break the sentence at word endings
        if ((pos($text) - $start) >= 79) {
            push @pieces, substr($text, $start, $end - $start + 1);
            $start = $end + 1;
        }
        else {
            $end = pos($text) - 1;
        }
    }
    push @pieces, substr($text, $start, length($text) - $start + 1);    # the remainder
    return @pieces;
}