## NOTE(review): this copy of nexispider.pl appears to be damaged and will not
## run as stored. Restore it from a pristine copy before changing any code.
##
## Damage visible in this text:
##   * Line breaks are missing: the header comment, the "#!/usr/local/bin/perl"
##     line and the statements share a few physical lines, so Perl treats
##     everything after the first "#" on each line as a comment.
##   * Text between "<" and ">" seems to have been removed. Examples:
##     'open (INFILE, ");', 'chomp($fileprfx = );', 'chomp($doc = );',
##     '$currentfile = "$outfile");' and 'index($doc,"",$bodyidx)'. The two
##     chomp() calls presumably read <STDIN> and <INFILE> -- TODO confirm.
##   * The bare line breaks inside two s/// patterns and inside the header's
##     "primary sentence separator" note presumably stood for markup that was
##     rendered instead of kept -- TODO confirm against the original.
##   * The code after "## check that the source is correct" is incomplete:
##     $day is used to build $newdate, but no assignment to $day survives.
##   * 's/"/\"/g' ("instantiate the quotes") and 's/\. _B/\. _B/g' ("eliminate
##     double spaces after periods") replace text with itself as written;
##     presumably an HTML entity and a doubled space were lost -- TODO confirm.
##
## Defects in the surviving statements, to fix once the file is restored:
##   * $do_quotes is assigned, but the quote filter tests $doquotes, which is
##     never assigned, so the filter runs whatever $do_quotes is set to.
##   * 'm/(AFP) - /g' has unescaped parentheses, so it matches "AFP - " and
##     not the literal "(AFP) - " described in the header; the following
##     '$start = pos;' reads pos($_) rather than pos($line).
##   * None of the open() calls is checked for failure.
##   * '\1' on the replacement side of s/// should be written '$1'.
##   * '$newdate != $oldate' compares date strings numerically; 'ne' is the
##     string comparison.
##
## nexispider.pl ## ## This Perl program follows a set of linked news stories generated by ## the NEXIS Academic Universe system, then formats those stories into ## the KEDS format. ## ## This version of the program reads the initial URL from a file named ## "www.input". This file should be in the same directory/folder as the ## program. This system was used because the terminal emulator used to ## access the Unix machine where this code was developed would not correctly ## transfer lines of more than 255 characters. If you do not have this ## problem, just read the first $nexturl string directly from . ## ## Output is in a file named "$fileprfx.YYMMDD-YYMMDD" where $fileprfx is the ## file name prefix string that is entered when the program is started, and ## the YYMMDD are the dates of the first and last records in the file. ## For example: AFPLV.980831-980815 ## ## Note that NEXIS provides the stories in reverse chronological order, ## whereas KEDS needs these in forward order; another program will be ## required to change this. ## ## The program is currently set to process only Agence France Presse ## records. As usual, the information required to identify the beginning ## and ending of the story are somewhat idiosyncratic and may need to be ## modified to work with other sources. ## ## FULL STORIES AUTOMATICALLY SKIPPED ## ## 1. "SECTION: Sports" ## ## 2. Various news summaries and calendars ## ## 3. Graphics advisories -- key phrase is "Text slug:" ## ## 4. Middle East headlines by checking for the string ">Headlines across" ## ## SENTENCES AUTOMATICALLY SKIPPED ## ## 1. if $do_quotes == 0, then any line beginning or ending with " is ## skipped. This seems to eliminate almost all of the direct quotations, ## while leaving intact small internal quote-delimited phrases. ## ## 2. AFP erratically includes a dateline before the beginning of the text; ## this is skipped when it ends in the form "(AFP) - " ## ## 3. Sentences containing "ATTENTION - " ## ## 4. 
Sentences less than 72 characters in length: virtually all AFP sentences ## reporting news events run two or three 80-character lines, so single- ## line "sentences" are almost always junk such as authorship and subject ## information that escaped the other filters. ## ## ADDITIONAL NOTES ## ## 1. "

" is used as the primary sentence separator. The regular ## expression ## \. \n?([A-Z"]) ## catches some additional cases where AFP puts multiple sentences in ## the same paragraph. ## ## 2. The program currently contains almost no error checking, and NEXIS ## downloads may contain an assortment of dropped headers and the like. ## Beware. ## ## 3. This system does not check for duplicate stories. ## ## TO USE THE PROGRAM ## ## 1. Get into the NEXIS "Academic Universe" site using a web browser such ## as Netscape Navigator or Microsoft Explorer. ## ## 2. Click the "Sources" button, and on the resulting page, select "Agence ## France Presse". This will give only AFP stories. ## ## 3. Click the "Search" button, and enter the keywords and dates that you ## want to download. Note that NEXIS will reject any combination that ## produces more than 1000 documents. ## ## 4. When NEXIS has completed a search, it will display a list of all of the ## stories. Click the link to the first story. ## ## 5. Once you see the full text of the first story displayed, copy the URL ## for that story, then paste it into the file "www.input" and save that ## file. This URL is the starting point for the download. A simple editor ## such as pico (in Unix) can be used to edit and save "www.input". ## ## 6. Run the program using the Perl interpreter on your machine (in Unix, the ## command "perl nexispider.pl" will do this). ## ## INTELLECTUAL PROPERTY ISSUES ## ## The NEXIS "Academic Universe" site is a very useful and low-cost source of ## news stories for academic research. Information can be *temporarily* saved ## from the site for use in not-for-profit academic research without violating ## NEXIS's intellectual property rights or license provisions. 
However, all ## of this text is copyrighted and should not be used in a manner beyond the ## "fair use" provisions of the copyright code, nor should the text be trans- ## ferred outside the institution which holds the "Academic Universe" license. ## Failure to follow these reasonable provisions may result in the entire ## community losing access to this source, much as happened earlier with ## Reuters. Thank you for your cooperation. ## ## SYSTEM REQUIREMENTS ## ## This program has been successfully run on both Unix (Digital UNIX V4.0F -Rev. 1229) ## and Macintosh (MacPerl 5.2; Mac OS 9) systems; presumably it will also run in Windows. ## ## japh; 26-November-00 ## ## Revision history: ## 16 Jan 02: revision of date conversion to handle change in Nexis formatting ## 21 Jan 03: following features added: ## -- Program now requests the file prefix ## -- "Top ... stories" filter ## -- short (length<72) sentence filter ## -- division of multiple-sentence paragraphs filter ## 15 Aug 03: revised to accommodate additional NEXIS format changes ## #!/usr/local/bin/perl use LWP::Simple; $webfiles = 1; # 1 if reading from web; 0 if testing on AFP.test.n files $do_quotes = 0; # if 0, do not include lines beginning or ending with " -- this gets most direct quotes $fileprfx = 'AFPTST'; # set final file name prefix $seqno = '0001'; $oldate = '000000'; $firstdate = '--'; # this will be used to re-name the output file %month_number = ( # hash used to translate dates Jan => '01', Feb => '02', Mar => '03', Apr => '04', May => '05', Jun => '06', Jul => '07', Aug => '08', Sep => '09', Oct => '10', Nov => '11', Dec => '12', ); if ($webfiles) { open (INFILE, "); $outfile = 'NexiSpdr.'.substr($nexturl,length($nexturl)-6,6); #assign a unique identifier to file close(INFILE); print "Enter file prefix-->"; chomp($fileprfx = ); } else { $currentfile = "$outfile"); print "Temporary output file: $outfile\n"; $ka = 1; while ($ka >= 0) { # ka is here for testing purposes; normal exit is "if 
($nextidx < 0)..." print "\nFetching document ", $ka,"\n"; if ($webfiles) { $doc = get $nexturl; } # get the document else { # file-based method print "Trying to read ", $currentfile, "\n"; open (INFILE, $currentfile); chomp($doc = ); } # print OUTFILE "Original text\n$doc\n\n"; # debugging ## do not process sports stories, news summaries if ($doc =~ m/SECTION:<\/b> Sports/) { print " Skipped sports story\n"; next;} if ($doc =~ m/TOP WORLD NEWS STORIES/) { print " Skipped Top World News\n"; next;} if ($doc =~ m/op.{0,16} stories for /) { print " Skipped \"Top ... stories\"\n"; next;} if ($doc =~ m/News Summary for /) { print " Skipped \"News Summary\"\n"; next;} if ($doc =~ m/News Calendar for /) { print " Skipped \"News Calendar\"\n"; next;} if ($doc =~ m/Text slug:/) { print " Skipped graphics advisory\n"; next;} if ($doc =~ m/>Headlines across/) { print " Skipped Middle East headlines\n"; next;} ## check that the source is correct $bodyidx = index($doc,"Copyright"); $langidx = index($doc," $bodyidx = index($doc,"", $langidx); $langidx = index($doc,"(.+)...< $month = $1; $date =~ m/,(\s+)(\d+)/; # year follows comma, white-space $newdate = substr($2,2,2).$month_number{substr($month,0,3)}; if (1 == length($day)) { $newdate .= "0".$day } # prefix a zero to day if needed else { $newdate .= $day } if ($newdate != $oldate) { # reset the story sequence number $seqno = "0001"; $oldate = $newdate; print " NewDate: ",$newdate, "\n"; if ($firstdate =~ m/\A-/) { $firstdate = $newdate } # capture the date of the initial record } else { ++$seqno; } $bodyidx = index($doc,"BODY:"); # start of story text $langidx = index($doc,"",$bodyidx);# end of bodytext division $body = substr($doc,$bodyidx,$langidx-$bodyidx); $body =~ s/
/
/g; # reformat CSS breaks (new to NEXIS 03.08.15) $body =~ s/

/_BR_/g; # mark the usual sentence breaks # print OUTFILE "Marked text\n$body\n\n"; # debugging ## Convert from HTML to text $body =~ s/<([\w=" ])*>//g; # get rid of all the HTML tags $body =~ s/<\/(\w*)>//g; # get rid of any terminal HTML tags $body =~ s/\n//g; # get rid of new lines $body =~ s/\r//g; # get rid of returns $body =~ s/&\#160;//g; # get rid of   (spaces) $body =~ s/"/\"/g; # instantiate the quotes $body =~ s/BODY://; # get rid of header $body =~ s/\. _B/\. _B/g; # eliminate double spaces after periods $body =~ s/\. ([A-Z"])/\. _BR_\1/g; # mark the intra-paragraph sentence breaks ## split the text into sentences for output @sentences = split(/_BR_/,$body); $sentno = "01"; # initialize sentence number foreach $line(@sentences) { last if ($line =~ m/LANGUAGE:/); # test for the final line unless ($doquotes) { # skip lines that start or end with " (usually direct quotations) next if (($line =~ m/^\"/) || ($line =~ m/\"(\s*)$/)) } # next if ((length($line) < 72) && ($line =~ m/\//)); # skip single line sentences -- old version next if (length($line) < 72) ; # skip single line sentences next if ($line =~ m/ATTENTION - /); # skip "ATTENTION - " lines print OUTFILE "$newdate AFPN-$seqno-$sentno\n"; # write the identification line ## write the records with maximum 80-character line lengths if ($line =~m/(AFP) - /g) { $start = pos; } # skip over the occasional "...(AFP) - " headers else { $start = 0; } $end = 0; while ($line =~ m/ /g) { # break the sentence at word endings if ((pos($line) - $start)>= 79) { print OUTFILE substr($line, $start, $end - $start + 1),"\n"; $start = $end + 1; } else { $end = pos ($line) - 1; } } print OUTFILE substr($line, $start, length($line) - $start + 1),"\n\n"; # output the remainder print $sentno, " ", substr($line,0,15), "\n"; # provide some feedback to screen ++$sentno; } # foreach loop ## find the URL for the next document } continue { if ($webfiles) { $nextidx = index($doc,"nextactive.gif"); # this is the "Next" arrow on the 
NEXIS screen if ($nextidx < 0) { print "\n\aCan\'t find nextactive.gif: segment\n"; last; } $start = rindex($doc,"HREF=",$nextidx); $end = index($doc, "\"",$start+8); $nexturl ='http://web.lexis-nexis.com' . substr($doc,$start+6,$end-$start-6); # print "\n\nNext URL:", $nexturl, "\n"; } else { # get the next file name in test mode ++($currentfile); } ++$ka; } # continue block for while loop print "Finished:\n\a$outfile was renamed $fileprfx.$newdate-$firstdate\n\n"; close(OUTFILE); rename($outfile, "$fileprfx.$newdate-$firstdate"); # rename the file with the correct dates exit;