## NewNexisFormat.pl ## ## This Perl program formats the stories in a series of files downloaded from LexisNexis ## using their "Download Documents" options, and puts these stories into the TABARI ## input format. ## ## Output is a series of files named "$fileprfx.YYMMDD-YYMMDD" where $fileprfx ## is the file name prefix string that is entered when the program is started, ## and the YYMMDD are the dates of the first and last records in the file. ## For example: AFPLV.980831-980815 ## ## Note that NEXIS provides the stories in reverse chronological order, whereas ## TABARI needs these in forward order; the program nexisreverse.pl can be used to ## change this. See the file NewNexisFilter.readme.txt for instructions on how to ## combine these two programs. ## ## The program is currently set to process only Agence France Presse ## records. As usual, the information required to identify the beginning ## and ending of the story is somewhat idiosyncratic and may need to be ## modified to work with other sources. ## ## FULL STORIES AUTOMATICALLY SKIPPED ## ## 1. Sports stories ## ## 2. Various news summaries and calendars ## ## SENTENCES AND PHRASES AUTOMATICALLY SKIPPED ## ## 1. If $do_quotes == 0, then any line beginning or ending with " is ## skipped. This seems to eliminate almost all of the direct quotations, ## while leaving intact small internal quote-delimited phrases. ## ## 2. AFP erratically includes a dateline before the beginning of the text; ## this is skipped when there is a ':' in the first 16 characters of the story. ## ## 3. The obnoxious dagger markers (Unicode U+2020) ## ## 4. Sentences less than 72 characters in length: virtually all AFP sentences ## reporting news events run two or three 80-character lines, so single- ## line "sentences" are almost always junk such as authorship and subject ## information that escaped the other filters. ## ## ADDITIONAL NOTES ## ## 1. This system does not check for duplicate stories. ## ## 2. 
The program currently contains almost no error checking, and NEXIS ## downloads may contain an assortment of dropped headers and the like. ## Beware. ## ## 3. Because LexisNexis downloads are sent in Windows file format, the ## program automatically converts these to Unix format. If you are working ## in Windows, you'll need to skip this. ## ## TO USE THE PROGRAM (Macintosh) ## ## 1. Get into the NEXIS "Academic Universe" site using a web browser such ## as Netscape Navigator or Microsoft Explorer. ## ## 2. Click the "Sources" button, and on the resulting page, select "Agence ## France Presse". This will give only AFP stories. ## ## 3. Click the "Search" button, and enter the keywords and dates that you ## want to download. You can increase the efficiency of your downloads ## by eliminating sports stories and news summaries: these will be skipped ## anyway during the formatting. ## ## 4. When NEXIS has completed a search, it will display a list of all of the ## stories. Click the "Download Documents" option (the icon that ## quaintly looks like a disk -- remember those, from those primitive ## times before we had flash drives?). ## ## 5. Use "Select Items" in the "Document Range" box to download a maximum of ## 500 stories at a time. Also select (and unselect) the following options ## for the download file: ## ## Format: text ## Document View: Full document ## Font: Courier ## ## No additional options should be checked. ## ## 6. Put all of the Nexis files you intend to filter and this program in a ## folder/directory. ## ## 7. Open the Terminal (command-line) and move to that folder. ## ## 8. Assuming your Nexis downloads have a file name of the form ## ## Agence_France_Presse_-_English2007-09-14_16-31.TXT ## ## enter the command ## ## ls Agence_Fr* > format.files ## ## Alternatively, just use the command ## ## ls > format.files ## ## and manually edit out all of the files that are not to be formatted. ## ## 9. 
Enter the command ## ## perl NewNexisFormat.pl PREFIX ## ## where PREFIX is the prefix for the formatted file. For example ## ## perl NewNexisFormat.pl AFPLVT ## ## 10. Program should run, with the dates and headlines of the various stories ## scrolling past as they are processed. If the program stops working -- crashes ## or stops responding -- the last story displayed (or the one following it) is ## probably the cause, so just eliminate that story and try running the program ## again. ## ## INTELLECTUAL PROPERTY ISSUES ## ## The NEXIS "Academic Universe" site is a very useful and low-cost source of ## news stories for academic research. Information can be *temporarily* saved ## from the site for use in not-for-profit academic research without violating ## NEXIS's intellectual property rights or license provisions. However, all ## of this text is copyrighted and should not be used in a manner beyond the ## "fair use" provisions of the copyright code, nor should the text be trans- ## ferred outside the institution which holds the "Academic Universe" license. ## Failure to follow these reasonable provisions may result in the entire ## community losing access to this source, much as happened earlier with ## Reuters. Thank you for your cooperation. ## ## SYSTEM REQUIREMENTS ## ## This program has been successfully run under Mac OS 10.4; it is standard perl ## in Unix. If running in Windows, presumably the Windows-to-Unix file conversion ## code can be eliminated. ## ## PROVENANCE: ## Programmer: Philip A. Schrodt ## Kansas Event Data System Project ## Center for International Political Analysis ## University of Kansas ## Lawrence, Kansas, 66045 U.S.A. ## http://web.ku.edu/keds ## ## Copyright (c) 2008 Philip A. Schrodt. All rights reserved. 
##
##  Redistribution and use in source and binary forms, with or without modification,
##  are permitted under the terms of the GNU General Public License:
##  http://www.opensource.org/licenses/gpl-license.html
##
##  Report bugs to: schrodt@ku.edu
##
##  For plausible indenting of this source code, set the tab size in your editor to "2"
##
##  REVISION HISTORY:
##  22-March-08: Initial version
##

#!/usr/local/bin/perl
#use LWP::Simple;

use strict;
use warnings;

# ======== globals =========== #

my $do_quotes    = 0;                  # if 0, do not include lines beginning or ending with " -- this gets most direct quotes
my $filetemp     = 'nnftempfile.txt';  # temp file name of Windows format conversion
my $file_list    = "format.files";     # list of files to be processed
my $outfile      = 'nnfoutfile.txt';   # output file; this will be renamed
my $max_line_len = 78;                 # longest text line written, counting the space kept at each break
# $currentfile = "Agence_France_Presse_-_English2007-09-14_15-59.TXT"; # debug

my %month_number = (  # hash used to translate dates
  Jan => '01', Feb => '02', Mar => '03', Apr => '04',
  May => '05', Jun => '06', Jul => '07', Aug => '08',
  Sep => '09', Oct => '10', Nov => '11', Dec => '12',
);

# Dagger marker (U+2020) that is stripped from the story text.  The pattern
# covers both a decoded character and its raw UTF-8 byte sequence, because the
# input files are read without an encoding layer.
# NOTE(review): if the downloads are in a single-byte Windows code page the
# marker will be a different byte -- confirm against a real download.
my $dagger_re = qr/\x{2020}|\xE2\x80\xA0/;

# Headline patterns of stories that are not processed, with the message that is
# printed for each.  The rules are tried in order; the first match wins.
my @skip_rules = (
  [ qr/Football:/,                " Skipped sports story\n" ],
  [ qr/Tennis:/,                  " Skipped sports story\n" ],
  [ qr/Basketball:/,              " Skipped sports story\n" ],
  [ qr/Baseball:/,                " Skipped sports story\n" ],
  [ qr/Cricket:/,                 " Skipped sports story\n" ],
  [ qr/AFP Middle East [Nn]ews /, " Skipped Middle East news agenda\n" ],
  [ qr/AFP .+ news advisory/i,    " Skipped News Advisory\n" ],
  [ qr/AFP .+ news agenda/i,      " Skipped News Advisory\n" ],
  [ qr/GMT news advisory/i,       " Skipped News Advisory\n" ],
  [ qr/AFP features/i,            " Skipped AFP features\n" ],
  [ qr/AFP news calendar/i,       " Skipped news calendar\n" ],
  [ qr/AFP world news /i,         " Skipped world news agenda\n" ],
);

# ======== helpers =========== #

# Read one line from $fh and remove its trailing newline.
# Returns undef at end of file, so callers can stop instead of looping forever.
# readline() is used throughout rather than the angle-bracket operator so the
# source survives tools that strip anything resembling a markup tag.
sub read_chomped {
  my ($fh) = @_;
  my $text = readline($fh);
  chomp($text) if defined $text;
  return $text;
}

# Break $text at spaces into lines of at most $width characters.
# Each broken line keeps the space it was broken at, as the original output did.
# A single word longer than $width is left intact on a line of its own.
# Returns the list of lines (always at least one element).
sub wrap_sentence {
  my ($text, $width) = @_;
  my @lines;
  my $start = 0;   # index of the first character of the line being built
  my $break = -1;  # index of the most recent space that can end the line

  while ($text =~ m/ /g) {
    my $space = pos($text) - 1;
    if (($space - $start + 1) > $width && $break >= $start) {
      push @lines, substr($text, $start, $break - $start + 1);
      $start = $break + 1;
    }
    $break = $space;  # the space that overflowed is the next break candidate
  }

  # the tail after the last space can itself push the final line over the limit
  if ((length($text) - $start) > $width && $break >= $start) {
    push @lines, substr($text, $start, $break - $start + 1);
    $start = $break + 1;
  }
  push @lines, substr($text, $start) if $start < length($text) || !@lines;
  return @lines;
}

# ======== main program =========== #
#
# For every file named in $file_list:
#   1. copy it to $filetemp with carriage returns removed;
#   2. scan the copy story by story, writing TABARI-format records to $outfile;
#   3. rename $outfile to "$fileprfx.LASTDATE-FIRSTDATE".

my $fileprfx = $ARGV[0];  # set final file name prefix
# TODO (08.03.22): prompt for the prefix instead; also make $do_quotes an option
if (!defined($fileprfx) || length($fileprfx) == 0) {
  print "\aFile name prefix is a required argument! -- please re-run program\n";
  exit;
}

open(my $list_fh, '<', $file_list) or die "Can\'t open list of input files $file_list; error $!";
my $kfile = 0;

FILE:
while (defined(my $filename = readline($list_fh))) {  # file loop
  chomp($filename);
  next FILE if $filename =~ m/\A\s*\z/;  # ignore blank lines in the list
  $kfile++;
  print "\n==========================\nProcessing $filename\n";

  # convert from Windows to Unix format
  print "\nReading file $filename\n";
  open(my $raw_fh, '<', $filename) or die "Can\'t open input file $filename; error $!";
  my @filechng = readline($raw_fh);
  close($raw_fh);
  s/\r//g for @filechng;
  open(my $tmp_fh, '>', $filetemp) or die "Can\'t open temporary file $filetemp; error $!";
  print {$tmp_fh} @filechng or die "Problem printing: $!\n";
  close($tmp_fh) or die "Can\'t close temporary file $filetemp; error $!";

  open(my $out_fh, '>', $outfile) or die "Can\'t open output file $outfile; error $!";
  print "Temporary output file: $outfile\n";
  open(my $in_fh, '<', $filetemp) or die "Can\'t open modified input file $filetemp; error $!";

  my $seqno     = '0001';
  my $oldate    = '000000';
  my $firstdate = '--';  # this will be used to re-name the output file
  my $newdate   = '';
  my $ka        = 0;     # story counter, shown in the progress messages

  STORY:
  while (1) {
    $ka++;
    print "\nReading story $filename $ka\n";

    # advance to the source line that opens the next story
    my $line = readline($in_fh);
    while (defined($line) && $line !~ m/Agence France Presse -- English/) {
      $line = readline($in_fh);
    }
    last STORY if !defined($line) || eof($in_fh);

    $line = readline($in_fh);
    $line = readline($in_fh);  # date line
    last STORY unless defined $line;

    ## Extract the date, convert to KEDS format
    unless ($line =~ m/(\w+) (\d+), ((?:19|20)\d\d)/) {
      # a failed match would otherwise silently reuse the previous story's date
      print " Skipped story: no date found\n";
      next STORY;
    }
    my ($month, $day, $year) = ($1, $2, $3);
    my $month_no = $month_number{substr($month, 0, 3)};
    unless (defined $month_no) {
      print " Skipped story: unrecognized month \"$month\"\n";
      next STORY;
    }
    $newdate = substr($year, 2, 2) . $month_no . sprintf("%02d", $day);

    if ($newdate ne $oldate) {  # reset the story sequence number
      $seqno  = "0001";
      $oldate = $newdate;
      print " NewDate: ", $newdate, "\n";
      $firstdate = $newdate if $firstdate =~ m/\A-/;  # capture the date of the initial record
    }
    else {
      ++$seqno;  # string increment keeps the leading zeros
    }

    $line = readline($in_fh);
    $line = readline($in_fh);  # should be at the headline now
    last STORY unless defined $line;
    print "HEADLINE: ", $line;

    ## do not process sports stories, news summaries
    for my $rule (@skip_rules) {
      my ($pattern, $message) = @$rule;
      if ($line =~ $pattern) {
        print $message;
        next STORY;
      }
    }

    while (defined($line) && $line !~ m/DATELINE:/) {
      $line = readline($in_fh);
    }
    last STORY unless defined $line;
    chomp(my $dateline = substr($line, 9));
    print "Dateline: $dateline\n";

    $line = readline($in_fh);
    $line = readline($in_fh);
    my $sentno = "01";  # initialize sentence number
    $line = read_chomped($in_fh);

    SENTENCE:
    while (defined($line) && $line !~ m/LOAD-DATE:/) {
      # skip blank lines between paragraphs
      while (defined($line) && length($line) == 0) {
        $line = read_chomped($in_fh);
      }
      last SENTENCE unless defined $line;
      last SENTENCE if $line =~ m/SUBJECT:|ORGANIZATION:|GEOGRAPHIC:|LOAD-DATE:/;

      # join the lines of one paragraph, up to the next blank line or end of file
      my $sent = $line;
      while (defined($line = read_chomped($in_fh)) && length($line) > 0) {
        $sent .= " " . $line;
      }
      $line = read_chomped($in_fh) if defined $line;  # first line of next block

      $sent =~ s/$dagger_re//g;  # get rid of the dagger markers

      # remove DATELINES in sentences
      my $indx = index($sent, ':');
      if (($indx > 0) && ($indx <= 16)) {
        $sent = ($indx + 2 <= length($sent)) ? substr($sent, $indx + 2) : '';
      }

      unless ($do_quotes) {  # skip lines that start or end with " (usually direct quotations)
        next SENTENCE if (($sent =~ m/^\"/) || ($sent =~ m/\"(\s*)$/));
      }
      next SENTENCE if (length($sent) < 72);  # skip single line sentences

      print {$out_fh} "$newdate AFPN-$seqno-$sentno $dateline\n";  # write the identification line

      ## write the text broken at word endings, then a blank line to end the record
      print {$out_fh} $_, "\n" for wrap_sentence($sent, $max_line_len);
      print {$out_fh} "\n";

      ++$sentno;
    }  # SENTENCE loop
  }  # STORY loop

  close($in_fh);
  # close before renaming so that buffered output is flushed first
  close($out_fh) or die "Can\'t close output file $outfile; error $!";

  if ($firstdate =~ m/\A-/) {
    # no story supplied a date, so there is nothing to name the output after
    print "No dated stories found in $filename; no output file written\n";
    unlink($outfile);
    next FILE;
  }

  # rename the file with the correct dates
  my $finalname = "$fileprfx.$newdate-$firstdate";
  rename($outfile, $finalname) or die "Can\'t rename output file $outfile to $finalname; error $!";
  print "Finished:\n$outfile was renamed $finalname\n";
}  # FILE loop
close($list_fh);

if (-e $filetemp) {  # absent when the file list was empty
  unlink($filetemp) or die "Can\'t delete temporary file $filetemp; error $!";
}
print "\n\aFinished: $kfile files processed\n";
exit;