## OSC.filter.pl
##
## This Perl program is used for combining files from the U.S. government's Open Source
## Center (http://opensource.gov) data system when the files were downloaded using
## the Firefox (http://www.mozilla.com/firefox/) extension DownThemAll (http://www.downthemall.net).
## It combines these into a single HTML file while removing most of the extraneous HTML
## support code from the original downloads. This was modified from the KEDS project program
## FactivaFilter.
##
## RUNNING PROGRAM
##
## Put the program in the folder with all of the files, then enter the command
##
## perl OSC.filter.1b0.pl output_file
##
## where "output_file" is the name of the file that will contain the combined stories. If
## the file name does not contain .html, that will be added as an extension.
##
## Example:
##
## perl OSC.filter.1b0.pl SASIA.Oct07
##
## will put the output in a file named SASIA.Oct07.html
##
## INPUT FILES
##
## $file_list = "filter.files";
## List of the OSC news files that are to be processed.
##
## Example:
##
## 7868908
## 7868958
## 7868982
## 7869034
## 7869043
##
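## This is easily generated by using "ls > filter.files"; afterwards, delete the lines for
## filter.files itself, this program, and any other non-story files from the list.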
##
## OUTPUT FILE ("SASIA.Oct07.html" in example)
##
## This combines all of the files in the "filter.files" list into a single HTML file,
## preserving the HTML formatting of the OSC story, but removing all of the other support
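## code. Stories are separated by a separator tag written after each story (the tag name
## is missing from this copy of the comment; it was probably <hr>).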
##
## PROGRAMMING NOTES:
##
## None at present
##
## SYSTEM REQUIREMENTS
## This program has been successfully run on perl under the Mac OS-10.4 system
##
##
## PROVENANCE:
## Programmer: Philip A. Schrodt
## Kansas Event Data System Project
## Center for International Political Analysis
## University of Kansas
## Lawrence, Kansas, 66045 U.S.A.
## http://web.ku.edu/keds
##
## Copyright (c) 2008 Philip A. Schrodt. All rights reserved.
##
## Redistribution and use in source and binary forms, with or without modification,
## are permitted under the terms of the GNU General Public License:
## http://www.opensource.org/licenses/gpl-license.html
##
## Report bugs to: schrodt@ku.edu
##
## Modifications:
## 05-Jul-04: Initial version for FactivaFilter for AFP (FAST project)
## 15-Mar-05: Revised to handle Reuters for PITF
## 27-Jan-07: Revised to handle format shift to Windows line feeds
## 15-Jan-08: Revised to handle Open Source Center downloads
## 15-Apr-08: Revised to preserve HTML formatting
##
#!/usr/local/bin/perl
# Set-up phase: validate the command line, open the list of input files (FDIR)
# and the combined output file (FOUT), and write the HTML header.
#
# "use strict" is deliberately not enabled: the rest of the script relies on
# undeclared package globals ($file, $line, $first, $kb).
# ======== globals =========== #
$file_list = "filter.files";   # file naming the downloads to combine, one per line
# ======== main program =========== #
# Check the argument before touching any files, so a missing argument is
# reported as such (on STDERR, with a failing exit status) and short but valid
# names such as "Oct" are no longer rejected.
if (!defined($ARGV[0]) || $ARGV[0] eq '') {
    print STDERR "No output file argument was supplied\nProgram is terminating\n";
    exit 1;
}
$filename = $ARGV[0];
# As documented in the usage notes: the extension is added only when ".html"
# does not occur anywhere in the supplied name.
if ($filename !~ m/\.html/) { $filename .= ".html"; }
# Three-argument open keeps characters such as '>' or '|' in a name from being
# interpreted as an open mode. The list is opened first so that a missing
# list does not leave an empty output file behind.
open(FDIR, '<', $file_list) or die "Can\'t open list of input files $file_list; error $!";
open(FOUT, '>', $filename) or die "Can\'t open output file $filename; error $!";
#write html header
# NOTE(review): the tag text had been lost from these print statements (they
# wrote empty strings); this is a minimal reconstruction -- compare with a
# pristine copy of the program if the exact header matters.
my $title = $ARGV[0];          # the title is the name as typed, without the added extension
$title =~ s/&/&amp;/g;         # escape HTML metacharacters in the title text
$title =~ s/</&lt;/g;
$title =~ s/>/&gt;/g;
print FOUT "<html>\n";
print FOUT "<head>\n";
print FOUT "<title>", $title, "</title>\n</head>\n\n<body>\n";
$kb = 0; # count of files processed
# Per-file loop: for each entry of the input list, extract the story text from
# that download and append it to the combined output on FOUT.
#
# NOTE(review): several lines in this loop look truncated. Wherever the code
# should contain text in angle brackets -- the filehandle being read in each
# "while ($... = )" and "$line = ;" (presumably FDIR for the list and FIN for
# the story file), and the HTML tag patterns inside the m// and s/// operators
# -- that text is missing, and some statements are split across lines. The
# loop cannot compile as it stands; restore it from a pristine copy of the
# program rather than by guesswork. The notes here describe only what the
# surviving text shows.
#
# Visible steps for each file:
#   1. chomp the name, report "Processing <name>" on STDOUT, and open the file
#      as FIN, dying with the system error if that fails.
#   2. "skip file header": read lines until some marker pattern is seen (the
#      pattern is lost), read further lines ("skip two lines"), then strip
#      leading blanks and tabs from the line that starts the story.
#   3. "process main text": accumulate the story in $first, appending each
#      later line whose length exceeds 4; the remaining tests in that inner
#      loop are truncated.
#   4. A run of "$first =~ s///" clean-ups ends at "=cut". Presumably they were
#      disabled by a POD block whose opening directive is no longer visible
#      (consistent with the 15-Apr-08 "preserve HTML formatting" entry in the
#      modification log) -- TODO confirm.
#   5. Print $first and a separator string to FOUT, close FIN, increment $kb.
while ($file = ) { # file loop
chomp($file);
print "Processing $file\n";
# NOTE(review): two-argument open -- characters such as '>' or '|' at the start
# or end of a list entry would be interpreted as an open mode.
open(FIN,"$file") or die "Can\'t open input file $file; error $!";
### main record processing loop ###
# skip file header
while ($line = ) {
if ($line =~ m/; # skip two lines
$line = ;
$line = ;
$line =~ s/^[ \t]+//; # get rid of leading white space
# process main text
$first = $line;
while ($line = ) {
if ($line =~ m// /gi; # get rid of tags
if ($line =~m/
;}
}
if (length($line) > 4) {$first .= $line;}
if ($line =~m/
/\n/g; # replace
with line feeds
$first =~ s/
//g; # remove
$first =~ s//\n/gi; # replace
with line feeds
$first =~ s/
/\n/gi; # replace
with line feeds
$first =~ s/<\/P>//gi;
$first =~ s/]*>//g; # remove //g;
$first =~ s/"/"/g; # convert "
$first =~ s/ / /g; # convert
$first =~ s/\r/\n/g; # replace internal Windows line feeds
=cut
print FOUT $first;
print FOUT "\n\n
\n\n";
close(FIN);
++$kb;
# if ($kb>10) {last} # debugging
} # end file loop
# Finish the combined document and release the file handles.
# NOTE(review): the closing tags are a minimal reconstruction; previously only
# a bare newline was written here although the comment says "terminate html".
print FOUT "</body>\n</html>\n"; #terminate html
close(FDIR);   # input list; it was never closed before
# Buffered write errors (for example a full disk) only surface when the handle
# is closed, so a failed close must not be reported as success.
close(FOUT) or die "Can\'t close output file $filename; error $!";
print "Program has finished!\n";