## OSC.filter.pl ## ## This Perl program is used for combining files from the U.S. government's Open Source ## Center (http://opensource.gov) data system when the files were downloaded using ## the FireFox (http://www.mozilla.com/firefox/) extension DownThemAll (http://www.downthemall.net). ## It combines these into a single HTML file while removing most of the extraneous HTML ## support code from the original downloads. This was modified from the KEDS project program ## FactivaFilter. ## ## RUNNING PROGRAM ## ## Put the program in the folder with all of the files, then enter the command ## ## perl OSC.filter.1b0.pl output_file ## ## where "output_file" is the name of the file that will contain the combined stories. If ## the file name does not contain .html, that will be added as an extension. ## ## Example: ## ## perl OSC.filter.1b0.pl SASIA.Oct07 ## ## will put the output in a file named SASIA.Oct07.html ## ## INPUT FILES ## ## $file_list = "filter.files"; ## List of the OSC news files that are to be processed. ## ## Example: ## ## 7868908 ## 7868958 ## 7868982 ## 7869034 ## 7869043 ## ## This is easily generated by using "ls > filter.files" ## ## OUTPUT FILE ("SASIA.Oct07.html" in example) ## ## This combines all of the files in the "filter.files" list into a single HTML file, ## preserving the HTML formatting of the OSC story, but removing all of the other support ## code. Stories are separated by
## NOTE(review): summary of the main script that follows, limited to what the visible code shows:
##   - opens the list file named by $file_list ("filter.files") as FDIR and dies if that fails;
##   - prints a message and exits when length($ARGV[0]) <= 4;
##   - copies $ARGV[0] to $filename, appends ".html" unless it already matches /\.html/,
##     and opens that name for writing as FOUT;
##   - for each $file it opens FIN, accumulates text in $first, prints $first to FOUT
##     followed by a separator string, closes FIN and increments $kb.
## NOTE(review): this copy looks damaged and is not expected to compile as shown -- TODO restore
## from an intact copy before running:
##   - the code shares physical lines with "##" comment text, so Perl would treat it as comment;
##   - the readline expressions appear as "= )" and "= ;" with no filehandle; presumably these
##     were <FDIR> (file list) and <FIN> (current input file) -- confirm;
##   - the HTML header/footer print statements contain empty or newline-only strings, and the
##     per-story separator string is only newlines; presumably HTML tags were stripped -- confirm;
##   - several m/.../ and s/.../.../ literals are cut off mid-pattern (header skip, end-of-story
##     test, tag clean-up), so their intended patterns cannot be determined from this copy;
##   - a "=cut" appears before "print FOUT $first" with no visible matching POD start;
##     presumably the clean-up substitutions ahead of it were disabled by a POD block -- confirm.
## ## PROGRAMMING NOTES: ## ## None at present ## ## SYSTEM REQUIREMENTS ## This program has been successfully run on perl under the Mac OS-10.4 system ## ## ## PROVENANCE: ## Programmer: Philip A. Schrodt ## Kansas Event Data System Project ## Center for International Political Analysis ## University of Kansas ## Lawrence, Kansas, 66045 U.S.A. ## http://web.ku.edu/keds ## ## Copyright (c) 2008 Philip A. Schrodt. All rights reserved. ## ## Redistribution and use in source and binary forms, with or without modification, ## are permitted under the terms of the GNU General Public License: ## http://www.opensource.org/licenses/gpl-license.html ## ## Report bugs to: schrodt@ku.edu ## ## Modifications: ## 05-Jul-04: Initial version for FactivaFilter for AFP (FAST project) ## 15-Mar-05: Revised to handle Reuters for PITF ## 27-Jan-07: Revised to handle format shift to Windows line feeds ## 15-Jan-08: Revised to handle Open Source Center downloads ## 15-Apr-08: Revised to preserve HTML formatting ## #!/usr/local/bin/perl # ======== globals =========== # $file_list = "filter.files"; # ======== main program =========== # open(FDIR,$file_list) or die "Can\'t open list of input files $file_list; error $!"; #read output file and date file names, open same #$file = ; #chomp($file); if (length($ARGV[0]) <= 4) {print "No output file argument was supplied\nProgram is terminating\n";exit;} $filename = $ARGV[0]; if ($filename !~ m/\.html/) { $filename .= ".html";} open(FOUT,">$filename") or die "Can\'t open output file $filename; error $!"; #write html header print FOUT ""; print FOUT ""; print FOUT "\n",$ARGV[0],"\n\n\n\n"; $kb = 0; # count of files processed while ($file = ) { # file loop chomp($file); print "Processing $file\n"; open(FIN,"$file") or die "Can\'t open input file $file; error $!"; ### main record processing loop ### # skip file header while ($line = ) { if ($line =~ m/; # skip two lines $line = ; $line = ; $line =~ s/^[ \t]+//; # get rid of leading white space # 
process main text $first = $line; while ($line = ) { if ($line =~ m/
/ /gi; # get rid of tags if ($line =~m/
;} } if (length($line) > 4) {$first .= $line;} if ($line =~m//\n/g; # replace
with line feeds $first =~ s/
//g; # remove
$first =~ s/

/\n/gi; # replace

with line feeds $first =~ s/

/\n/gi; # replace

with line feeds $first =~ s/<\/P>//gi; $first =~ s/]*>//g; # remove //g; $first =~ s/"/"/g; # convert " $first =~ s/ / /g; # convert   $first =~ s/\r/\n/g; # replace internal Windows line feeds =cut print FOUT $first; print FOUT "\n\n


\n\n"; close(FIN); ++$kb; # if ($kb>10) {last} # debugging } # end file loop print FOUT "\n"; #terminate html close(FOUT); print "Program has finished!\n";