#!/usr/bin/perl -w # pindex # Program to generate a permuted index for an input file. Inspired by UNIX # ptx, but hopefully with a more immediately usable output. # Version 2: adding more configuration options including input line format # specification, different field types, sort order flag, etc. # Perl port: this program was originally written in C, and in fact contained a # number of cunning bits of code to work around the limited amount of RAM # available to the program then (in the DOS version, this was particularly # relevant for the old large memory model). A lot of these problems disappear # with Perl, as does the need to maintain custom binary search functions and so # on. # # Created: 6/10/92 # Version 2: 4/8/93 # Perl port: 28/10/00 # # (c) Duncan Ellis # Released under the GPL. use strict; # These are defaults - need a different mechanism. use constant SEP1 => " : "; use constant SEP2a => " > "; use constant SEP2b => "*> "; use constant DEFAULT_IGNORE => "eign"; use constant DEFAULT_DELIM => " \t\n"; my $help = < [-ign |-only ] [-l ] [-k ] [-d ] -case ... where is a file conforming to the pattern ^\S+\s.+$ is a list of words to ignore, one per line is the set of words to use exclusively in constructing the index, one per line is a sequence of regex characters which will be added to the default set of ' \t\n' -case turns on case sensitivity in key comparison HELP my $line_width = 78; my $key_width = -1; # Implies no limit my $in_filename; my $out_filename; # only writes to stdout for now my $ignore_filename = 'eign'; my $only_filename; my $delim_filename; my $ignoreCase = 1; my $arg; for ($arg = shift; defined($arg); $arg = shift) { if ($arg eq "-ign") { $ignore_filename = shift; } elsif ($arg eq "-only") { $only_filename = shift; } # elsif ($arg eq "-o") { $out_filename = shift; } elsif ($arg eq "-k") { $key_width = shift; } elsif ($arg eq "-l") { $line_width = shift; } elsif ($arg eq "-d") { $delim_filename = shift; } elsif ($arg eq "-case") { $ignoreCase = 0; } elsif ($arg eq "--help" || $arg eq "-?") { print $help; exit; } elsif (!defined($in_filename)) { $in_filename = $arg; } } $::caseForceFunction = $ignoreCase ? sub { my $word = shift; return lc($word); } : sub { my $word = shift; return $word; }; my $ignore = defined($ignore_filename) ? read_ignore_keys($ignore_filename) : undef; my $only = defined($only_filename) ? read_only_keys($only_filename) : undef; my $delims = defined($delim_filename) ? DEFAULT_DELIM . read_delimiters($delim_filename) : DEFAULT_DELIM; open IN_FILE, $in_filename || die "*** Failed to open input file: $!"; my $max_key_len = 0; my $max_ref_len = 0; my %keywords = (); while () { chomp; # Extract folder reference and unprocessed source component (going onto # next line if not matched) my $line = $_; next unless $line =~ /^(\S+)\s(.+)$/; my $reference = $1; my $source = $2; $max_ref_len = length($reference) if (length($reference) > $max_ref_len); my %these_keys; my @words = split /[$delims]/, $line; shift @words; # already got the reference foreach my $word (@words) { # Ignore empty keys next if ($word eq ""); # Figure out if the word is to be ignored or excluded. # BUGBUG This could be improved by using regexes for the ignore/only # expressions at the cost of slowing down the check. next if (defined($ignore) && exists($$ignore{&$::caseForceFunction($word)})); next unless (!defined($only) || exists($$only{&$::caseForceFunction($word)})); # Have we already seen this word? next if (exists($these_keys{&$::caseForceFunction($word)})); $these_keys{&$::caseForceFunction($word)} = ""; my $permutation = { 'key' => $word, 'ref' => $reference, 'src' => $source }; # push @{$keywords{lc($word)}}, $permutation; push @{$keywords{&$::caseForceFunction($word)}}, $permutation; # Calculate maximum keyword length $max_key_len = length($word) if (length($word) > $max_key_len); } } close IN_FILE; # Figure out format strings for displaying permuted information if (($key_width > -1) && ($max_key_len > $key_width)) { $max_key_len = $key_width; } my $source_len = $line_width - length(SEP1) - length(SEP2a) - $max_ref_len - $max_key_len; my $full_source = sprintf("%c-%ds%s%c-%ds%s%c%ds\n", ord('%'), $max_key_len, SEP1, ord('%'), $source_len, SEP2a, ord('%'), $max_ref_len); my $part_source = sprintf("%c-%ds%s%c-%ds%s%c%ds\n", ord('%'), $max_key_len, SEP1, ord('%'), $source_len, SEP2b, ord('%'), $max_ref_len); foreach my $key (sort keys(%keywords)) { foreach my $perm (@{$keywords{$key}}) { my $outSource = length($$perm{src}) > $source_len ? substr($$perm{'src'}, 0, $source_len) : $$perm{'src'}; my $outFormat = length($$perm{'src'}) > $source_len ? $part_source : $full_source; printf $outFormat, $$perm{'key'}, $outSource, $$perm{'ref'}; } } # read_ignore_keys() # # Reads in keywords to be ignored from the given file. # # Params: filename to be read from # # Return: hash containing ignore words mapped to itself if successful, # undef otherwise. sub read_ignore_keys { my $ignoreFile = shift; open(IGNORE, $ignoreFile) || return undef; my @words = ; close IGNORE; chomp @words; # my %ignore = map { (lc($_), $_) } @words; my %ignore = map { (&$::caseForceFunction($_), $_) } @words; return \%ignore; } # End of read_ignore_keys() # read_only_keys() # # Reads in keywords to be used exclusively from the given file. # # Params: filename to be read from # # Return: hash containing only words mapped to itself if successful, # undef otherwise. sub read_only_keys { open ONLY, shift || die "*** Failed to open only file: $!"; my @words = ; close ONLY; chomp @words; # my %only = map { (lc($_), $_) } @words; my %only = map { (&$::caseForceFunction($_), $_) } @words; return \%only; } # End of read_only_keys() # read_delimiters() # # Reads in characters from delimiters file and appends them to the default # string. # # Params: filename to read from # # Return: delimiter string sub read_delimiters { open DELIM, shift || die "*** Failed to open delimiter file: $!"; my $delims = ; close DELIM; chomp $delims; return $delims; } # End of read_delimiters()