## nexisreverse.pl
##
## This Perl program reverses the order of stories that were downloaded from
## NEXIS using the nxdnldformat.pl or nexispider.pl programs (more generally,
## it will reverse the order of any "KEDS-formatted" files). The program
## solves the problem of NEXIS downloading stories in reverse chronological
## order, while event data coding usually needs records in chronological
## order. The program also combines multiple NEXIS downloads (which are
## limited to 1,000 records in the Academic Universe subscription) into
## a single file, and the current version gets only the lead sentence.
## (The lead sentence is recognized by looking for a string of the form
## "-dddd-01" in the identification line of the record; this is the
## standard KEDS/TABARI designator for the first line of a story.)
##
## INPUT FILE:
## $file_list : This file contains the names of all of the files that will be
##              combined, with one file name per line. These should be listed
##              in *chronological order* (not reverse order). If you are
##              working on a Unix system, the quick way to generate this is
##              with redirection of a directory listing:
##                  ls > filelist
##              All of the files in this list should also be in the same
##              directory as the program.
##
## OUTPUT FILES:
## $output_name : The combined, chronologically listed records are in this file.
## $summary_name: This gives a record of the files processed and the number of
##                records found for each day. It is a *very* good idea to
##                scan this (or plot it in Excel -- the file is tab-delimited)
##                to make sure no days have been skipped.
##
## TEMPORARY FILES:
## The program creates a large number of temporary files with names of the
## form SCR.dddddd, where dddddd is a date. These are deleted when the
## program terminates normally, but would be left behind if something went
## wrong. They can easily be deleted with a global remove (rm SCR.*) in Unix.
## On the Mac, just select and trash'em.
##
## PROGRAM LOGIC
## Pretty simple: every time the program finds a new date, it creates a new
## temporary file ("SCR" = "scratch file"; that's old Control Data Corp.
## terminology for a temporary file), and writes all of the records to that
## file. When it hits the end of the input file, it goes back and transfers the
## records in all of those files to the $output_name. Repeat for every file
## listed in $file_list.
##
## ERROR CHECKING AND FILTERING
## 1. Very occasionally, some of the nxdnldformat.pl records are missing
##    dates. When this occurs, the program substitutes the current date,
##    i.e. the date currently being processed (the date of the preceding
##    record), not today's date.
##    This will also eliminate garbage dates (that is, lines that do not
##    begin with 6 consecutive digits).
##
## 2. The statement
##       if ($line1 =~ m/\(AFP\) - /) { print FSCR $'; }
##    is there to get rid of some AFP datelines that somehow escaped
##    nexis_spider. Note that this loop also eliminates lines that contain
##    only a single character -- some lines containing a single blank were
##    filtered out using this.
##
## 3. Program eliminates records where the first line is identical to the first
##    line of an earlier story with the same date. This eliminates many, but
##    not all, duplicate stories.
##
## 4. Program eliminates records that have only one line of text. Except for
##    very rare occasions, these are junk that escaped other filters.
##
## 5. When records are found to contain multiple sentences, where a sentence
##    split is defined by the pattern
##       m/\. [A-Z"\n]/
##    only the first sentence is transferred.
##
## 6. "top ... stories" headers are eliminated.
##
##
## MODIFICATIONS
## 1. Program is currently set to just pull out lead sentences.
##    To get *all* sentences, change "01" to "\d\d" in the line
##
##       if ($line =~ m/-\d\d\d\d-01/)
##
##    and modify the statement
##
##       print FSCR substr($line,0,index($line,'-')),"-$seqno-01\n";
##
##    to use the original sentence number rather than fixing this at '01'.
##    For example, for sequence headers formatted
##
##       020101 AFPN-0005-01
##
##    this could be done using
##
##       print FSCR substr($line,0,index($line,'-')),"-$seqno",substr($line,17,19),"\n";
##
##
## OTHER NOTES
## 1. Note that this program is set up to use 6-digit, rather than 8-digit, dates.
##
## 2. Given the simplicity of the logic of this program, it would be easy to
##    incorporate into nexis_spider.pl. We're using the second pass at the data
##    to do some additional filtering.
##
## 3. There are probably people who would appreciate it if this had an interface
##    where the file names were entered from the keyboard rather than set in
##    the program.
##
## 4. The filters in [4] - [6] are now redundant since these have been
##    incorporated into nexispider.pl. However, this can be used to post-filter
##    files that have been previously downloaded.
##
## SYSTEM REQUIREMENTS
## This program has been successfully run on both Unix (Digital UNIX V4.0F - Rev. 1229)
## and Macintosh (MacPerl 5.2; Mac OS 9) systems; presumably it will also run in Windows.
##
## japh; 15-March-01
##
## Modifications:
## 26-January-03: duplicate check and single-line filters added
##

#!/usr/local/bin/perl

use strict;
use warnings;

## GLOBAL FILE NAMES
my $file_list      = "filelist";
my $output_name    = "reverse.output";
my $summary_suffix = "summary";
my $summary_name   = "$file_list.$summary_suffix";

open(my $fout, '>', $output_name)
    or die "Can't open output file $output_name; error $!";
open(my $fsum, '>', $summary_name)
    or die "Can't open summary file $summary_name; error $!";

## Read the list of input files once. Surrounding whitespace is stripped
## (2-argument open used to do that implicitly) and blank lines are ignored
## so that a stray empty line in the list does not abort the run.
open(my $fdir, '<', $file_list)
    or die "Can't open list of input files $file_list; error $!";
my @input_files;
while (my $name = <$fdir>) {
    $name =~ s/\A\s+//;
    $name =~ s/\s+\z//;
    next if $name eq '';
    push @input_files, $name;
}
close($fdir)
    or die "Can't close list of input files $file_list; error $!";

## write header of the summary file
my $rightnow = localtime;
print "\nRunning program \"nexisreverse\"\nOutput file: $output_name\nFile list: $file_list\n";
print {$fsum} "nexisreverse program summary\nDate of program run: $rightnow\n";
print {$fsum} "Output file: $output_name\nFile list: $file_list\nFiles processed:\n";
print {$fsum} "$_\n" for @input_files;
print {$fsum} "\n Date\tRecords\n";

## process every input file in the order listed
for my $file (@input_files) {
    print "Reading $file\n";
    reverse_one_file($file, $fout, $fsum);
}

close($fout)
    or die "Can't close output file $output_name; error $!";
close($fsum)
    or die "Can't close summary file $summary_name; error $!";
print "\nProgram has finished!\nSummary is saved in $summary_name\n";

## ---------------------------------------------------------------------------
## skip_record($fh, $line)
##   Starting with $line, reads from $fh until a blank line (one that holds
##   only the newline) or end-of-file is reached. Used to discard the
##   remainder of a record.
## ---------------------------------------------------------------------------
sub skip_record {
    my ($fh, $line) = @_;
    while (defined $line && length($line) > 1) {
        $line = <$fh>;
    }
    return;
}

## ---------------------------------------------------------------------------
## reverse_one_file($file, $fout, $fsum)
##   Splits the records of $file into one scratch file per date (SCR.dddddd),
##   applying the filters described in the header, then copies the scratch
##   files to $fout in the reverse of the order in which the dates were first
##   seen, writes one "date<TAB>count" line per date to $fsum, and deletes
##   the scratch files. Dies if any file cannot be opened or closed.
## ---------------------------------------------------------------------------
sub reverse_one_file {
    my ($file, $fout, $fsum) = @_;

    open(my $fin, '<', $file)
        or die "Can't open input file $file; error $!";

    my @dates;        # dates in order of first appearance in this file
    my %seqno_for;    # next record sequence number for each date
    my %lead_seen;    # $lead_seen{$date}{$first_line}: duplicate filter
    my $curdate;      # date being processed; undef until a valid date is seen
    my $fscr;         # scratch file handle for $curdate

    while (defined(my $line = <$fin>)) {

        # select only the lead sentence.
        # To get all sentences, change "01" to "\d\d"
        if ($line !~ m/-\d\d\d\d-01/) {
            skip_record($fin, $line);
            next;
        }

        if ($line =~ m/^(\d\d\d\d\d\d)/) {    # look for a date
            my $date = $1;
            if (!defined $curdate || $date ne $curdate) {    # new date
                if (defined $fscr) {
                    close($fscr)
                        or die "Can't close temporary file SCR.$curdate; error $!";
                }
                $curdate = $date;

                # A date that shows up again later in the same file is
                # appended to its existing scratch file instead of
                # truncating it, and keeps its sequence counter and
                # duplicate list.
                my $mode = '>>';
                if (!exists $seqno_for{$curdate}) {
                    $mode = '>';
                    $seqno_for{$curdate} = 1;
                    push @dates, $curdate;
                }
                open($fscr, $mode, "SCR.$curdate")
                    or die "Can't open temporary file SCR.$curdate; error $!";
            }
        }
        else {
            # if no date found, assume it belongs to the date being processed
            print "Bad date: $line";
            print "  (no dated record seen yet in $file; record dropped)\n"
                if !defined $curdate;
        }

        my $line1 = <$fin>;    # get next two lines
        my $line2 = <$fin>;

        my $keep =
               defined $curdate                      # a scratch file is open
            && defined $line1
            && defined $line2
            && length($line2) > 2                    # more than one text line
            && !$lead_seen{$curdate}{$line1}         # not a duplicate
            && $line1 !~ m/top.{1,16}stories/;       # not a "top ... stories" header

        if (!$keep) {
            # Skip from the real second line, so that a rejected one-line
            # record does not swallow the record that follows it.
            skip_record($fin, $line2);
            next;
        }

        $lead_seen{$curdate}{$line1} = 1;    # save first line

        # Identification line: everything before the first '-' plus a fresh
        # sequence number. The sentence number is fixed at '01'; to keep the
        # original one instead, capture it with m/-\d\d\d\d-(\d\d)/.
        my $seqno = sprintf("%04d", $seqno_for{$curdate}++);
        print {$fscr} substr($line, 0, index($line, '-')), "-$seqno-01\n";

        # filter to get rid of AFP date lines
        if ($line1 =~ m/\(AFP\) - /) { print {$fscr} $'; }
        else                         { print {$fscr} $line1; }

        # transfer the record. The ">2" eliminates lines containing a single blank
        my $text = $line2;
        while (defined $text && length($text) > 2) {
            if ($text =~ m/\. [A-Z"\n]/) {
                # multiple sentences: keep the first, skip the rest
                print {$fscr} "$`. \n";
                skip_record($fin, $text);
                last;
            }
            print {$fscr} $text;
            $text = <$fin>;
        }
        print {$fscr} "\n";    # blank line for record separator
    }

    if (defined $fscr) {
        close($fscr)
            or die "Can't close temporary file SCR.$curdate; error $!";
    }
    close($fin)
        or die "Can't close input file $file; error $!";

    # Copy the scratch files to the output, last date seen first.
    while (defined(my $date = pop @dates)) {
        my $curfile = "SCR.$date";
        open(my $fh, '<', $curfile)
            or die "Can't re-open temporary file $curfile; error $!";
        my $nrecs = 0;
        while (my $rec_line = <$fh>) {
            print {$fout} $rec_line;
            ++$nrecs if $rec_line =~ m/-\d\d\d\d-01/;    # count the records each day
        }
        print {$fsum} "$date\t$nrecs\n";
        close($fh)
            or die "Can't close temporary file $curfile; error $!";
        unlink $curfile
            or warn "Can't delete temporary file $curfile; error $!\n";
    }
    return;
}