## Unix Version Patch -- architext_query.pl ## This updated version of this library file removes ## a security hole that made shell-based hacking possible via CGI ## Architext perl library ## Copyright (c) 1996 Excite, Inc. package ArchitextQuery; require 'afeatures.pl'; require 'os_functions.pl'; ## should we do QBE? sub qbeMode { return $query_by_example_mode; } ## should we do summaries? sub summaryMode { return $summary_mode; } ## should scripts do subject grouping? sub asgMode { return $subject_group_mode; } ## are we in custom format mode? sub customFormat { return $custom_format; } sub helpPath { return $help_path; } ## is notifier enabled? sub notifyMode { return $notifier_on; } sub NSOn { $no_stemming_mode; } sub indexHtmlComments { return $index_html_comments; } ## also checks for existence of fielded data which implies new ## query syntax sub checkForIndex { local(%form) = @_; if (! -e "$form{'index'}.dat") { print "
Apparently, there is no index available for this\n";
print "collection, so queries cannot be made at this time.";
exit(0);
}
return (-e "$form{'index'}.fidx");
}
## returns value of the $show_legend variable, 'top' or 'bottom'
## or nothing at all.
sub legendLocation {
return $show_legend;
}
## returns value of $restrict_beneath_document_root variable
sub restrictBeneathRoot {
return $restrict_beneath_document_root;
}
sub errorIcon {
local($url) = @_;
return "";
}
sub indexTimeMapping {
return $index_time_mapping;
}
## forwards the search to www.excite.com if appropriate
sub directQuery {
local($search, $mode, $db, $source, $backlink, $linktext, $logfile) = @_;
local($url);
$db = 'excite_NetSearch' if ($source eq 'excite');
&ArchitextQuery'logQuery($db, $search, $logfile);
return unless ($source eq 'excite');
if ($backlink eq '*') {
$backlink = "";
} else {
$backlink = &httpize($backlink);
$backlink = "&backlink=$backlink";
}
if ($linktext eq '*') {
$linktext = "";
} else {
$linktext = &httpize($linktext);
$linktext = "&bltext=$linktext";
}
$search =~ s/\s/\+/g;
if ($mode eq "simple") {
$mode = "Keyword";
} else {
$mode = "Concept";
}
$url = "http://www.excite.com/search.gw?trace=ews1.1&search=$search&searchType=$mode&showqbe=1$backlink$linktext";
print "Location: $url\n\n";
print "
No documents found."; } ##ECO -- specialize message if all words were stopped $errstr = "
Error: All your search words were considered too commonplace. (e.g. a, and, or the) Please try again using less common words." if ($errstr =~ /All query words were stemmed/); $errstr = "success" unless $errstr; $errstr = "success" if ($errstr =~ /^ld\.so/); ##ignore runtime ld warnings $errstr = "summary" if ((($errstr=~/documents found/) || ($errstr eq "success")) && ($query =~ /^\(sum.+\d+\)/)); $errstr = "dump" if (($errstr eq "success") && ($query =~ /^\(dump.+\d+\)/)); return ($errstr, @queryresult); } ## Logs an error. Here, 'log' just means print to STDERR, because most ## httpds redirect STDERR to error_log. sub log_error { local($logmsg) = @_; local($time, $myname); ## Get the time chop($time = &main'ctime(time)); ## And get the name of the script currently running ($myname) = ($0 =~ m|/([^/]+)$|); print STDERR "[$time] $myname: $logmsg\n"; } ## simply outputs the summary on to the page with paragraph breaks ## between pages. optionally truncates the summary to a certain ## amount of bytes if that has been specified sub SummaryOutput { local($title, $url, *query) = @_; local($line, $total_length, $line_length); local($original_link); if ($summary_link_mode) { $original_link = "for $title" unless (($title eq '*') || ($url eq '*')); } print "
\n"; for $line (@query) { $line =~ s/~~~/
/g; $line_length = length($line); if ($maximum_summary_length && (($total_length + $line_length) > $maximum_summary_length)) { $line = substr($line, 0, ($maximum_summary_length - $total_length)); print "$line. . ."; last; } $total_length += $line_length; print $line; } } ## if not in $restrict_under_html_root mode, simply output the doc, one ## line at a time sub DocumentOutput { local(*query) = @_; print "
";
for $line (@query){
print "$line\n";
}
print "";
}
## this function takes a line of MakeQuery output, and returns
## an HTML'ized line of output
sub processResultLine {
local($line, $qbeimg, $sumimg, %form) = @_;
local($shown, $max, $ballsrc, $path, $result_line, $special_summary);
## Some perl code to transform a filename to a URL. The URL is in
## $_. A likely choice would be something like
## 's|^/usr/local/www/html/|http:/|'.
$urledit = $form{'urledit'};
$thresh = $form{'thresh'};
$format = $form{'format'};
$special_format = $form{'special'};
$numtoshow = $form{'num'};
$scorefun = $form{'scorefun'};
local($num, $score, $ball, $url, $title, $special_summary) = split(/\t/, $line);
## what happen here???
return 0 unless ($line =~ /\S/);
return 0 if (($num =~ /\D/) || ($score =~ /\D/));
$path = $url;
if ($title =~ /No title/) {
$url =~ /\/?([^\/]+)$/;
$title = "$1 ";
}
if ((!$title) || ($title =~ /Untitled/)) {
$title = $url;
$title =~ /\/?([^\/]+)$/;
## when the html file has no title, or if it is a text file,
## we just want to print out the filename (minus the path)
$title = "$1 ";
}
$title =~ s/^\s+//g;
$title =~ s/\s+$//g;
if ($scorefun eq 'scale') {
## Just multiply it by 100000.
$score *= 100000;
$score = sprintf("%05.0f", $score);
} elsif ($scorefun eq 'log') {
## log-based function that gives a smaller range of
## results.
$score = 10 + log($score) unless ($score == 0);
$score = 0 if $score < 0;
$score = sprintf("%04.3f", $score);
} elsif ($scorefun eq 'nothing') {
$score = sprintf("%02d%", $score); ##ECO
}
$max = $score unless $max;
##last if ($score < $max*$thresh);
$url = &addDumpOperator($url, $num, $form{'db'}, $form{'hroot'})
if (! $restrict_beneath_document_root);
## eval the URL.
if ($urledit) {
eval '$_=$url;' . $urledit;
$url = $_ unless $@;
}
$ourl = $url;
$url = &modifyAnchor($url);
$ballsrc =
"";
}
## collects the document numbers and relevance scores
## into a form to be used for auto subject grouping
sub PrepareGather {
local(*query) = @_;
local($line, $title, $order);
$name = "docs" unless $name;
$order = 0;
$args = "";
for $line (@query) {
($num, $score, $ball, $url, $title) = split(/\t/, $line);
if (($num !~ /\D/) && ($ball !~ /\D/)) {
$args .= "$num|$ball|$order ";
$order++;
last if (!$show_additional_docs_in_grouping &&
($order >= $max_docs_to_return));
}
}
chop($args); # Kill trailing ' '
return $args;
}
sub MakeGather {
local(%form) = @_;
local($queryprog, $configfile, $docstr, $timeout);
local(*QUERY, $errstr, @queryresult, @docs, @udocs);
local($groupnum, %groupwords, %grouparts, %titles, %colors, $num, $query);
local(%groupcount);
local(%rels);
local(%newgrouparts);
local(%urls, %second_urls);
local(%second_grouparts, %second_titles);
local(%summaries, %second_summaries);
local($pastbest, $totalarts);
local($gopen_ball, $ballsrc, $ndoc, $bdoc, $adoc, $qbe, $sum);
local($qbeimg, $sumimg);
local($result_line, $path, $sum_line);
$queryprog = $form{'binary'};
$configfile = "$form{'root'}/collections/$form{'db'}.cf";
$docstr = $form{'docs'};
## this bit of code figures out whether to use icons or
## text for the summary and qbe URLs
if ($icons{'qbe'}) {
$qbeimg =
qq();
} else {
$qbeimg = "Q";
}
if ($icons{'sum'}) {
$sumimg =
qq(
);
} else {
$sumimg = "(summary)";
}
## Make sure all the files are in place.
if (! &executable($queryprog)) {
&log_error("query program '$queryprog' does not exist " .
"or is not executable");
return &html_errorstr("Cannot run query program '$queryprog'");
}
if (!$configfile || ! -r $configfile) {
&log_error("cannot find config file '$configfile'");
return &html_errorstr("Cannot find configuration file");
}
if ($docstr eq '1') {
&log_error("You cannot do a subject-grouping on zero documents.");
return "You cannot do a subject-grouping on zero documents.";
}
## split docs and colored ball information into separate arrays
@udocs = split(' ', $docstr);
while ($adoc = pop(@udocs)) {
($ndoc, $bdoc, $rdoc) = split(/\|/, $adoc);
$colors{$ndoc} = $bdoc;
$rels{$ndoc} = $rdoc;
unshift(@docs, $ndoc);
}
if (!@docs) {
&log_error("No articles to gather");
return &html_errorstr("Invalid gather request");
}
$query = $form{'gather_cmd'};
$query =
"(g $gather_options g=$number_of_subject_groups (. DOCS))"
unless $query;
$query =~ s/DOCS/@docs/g;
$format_gopen = $form{'format_gopen'} ? $form{'format_gopen'} :
"%b Group %g: %w
$_\n";
}
}
##end patch RAM 1/13/98
if ($errstr =~ /\S/) {
return $errstr unless ($errstr =~ /^ld\.so/); ## ignore ld runtime
}
## code here will throw relevance 1 and 2 documents from
## the less-central document list in to the final
## output list
for $g (keys %second_grouparts) {
for $num (split(/\s+/, $second_grouparts{$g})) {
if ($colors{$num} == 1 || $colors{$num} == 2) {
$grouparts{$g} = "$num $grouparts{$g}";
$titles{$num} = $second_titles{$num};
$urls{$num} = $second_urls{$num};
$summaries{$num} = $second_summaries{$num};
}
}
}
## this code sorts the subgroups based on the relevance in the
## original query
for $g (keys %grouparts) {
for $h (sort { $rels{$b} <=> $rels{$a} } split(' ', $grouparts{$g})) {
$newgrouparts{$g} = "$h $newgrouparts{$g}";
}
}
%grouparts = %newgrouparts;
$groupnum = 1;
for $g (sort { length($grouparts{$b}) <=> length($grouparts{$a}) }
keys %grouparts) {
print &formatExpand($format_gopen,
'g', $groupnum,
'w', $groupwords{$g},
'b', $gopen_ball);
$grouparts{$g} =~ s/\s+$//;
$total_for_group = 0;
for $num (split(/\s+/, $grouparts{$g})) {
$total_for_group++;
## code here limits number of documents in a sub group
if ($total_for_group > $max_docs_per_subgroup) {
last unless ($rels{$num} > $max_docs_to_return);
}
$url = $urls{$num};
$path = $url;
$url = &addDumpOperator($url, $num, $form{'db'}, $form{'hroot'})
if (! $restrict_beneath_document_root);
if ($urledit) {
eval '$_=$url;' . $urledit;
$url = $_ unless $@;
}
$ourl = $url;
$url = &modifyAnchor($url);
## figure out the appropriate colored ball to put here
$ballsrc =
" Note: Subject Grouping this particular set of query results
has yielded a number of small groups. This
indicates either a small document collection on this server,
a small set of query results, or a
set of results that does not contain many subtopics.
EOF
;
}
return "success";
}
## checks for less-than-deal subject grouping results
sub goodGrouping {
local($numarticles, %groups) = @_;
local($g, $numgood);
if (! $advise_on_gather) { return 1; }
$numgood = 0;
for $g (keys %groups) {
$val = $groups{$g} / $numarticles;
if (($groups{$g} / $numarticles) > 0.04 ) {
$numgood++;
}
}
if ($numgood > 1) { return 1; }
return 0;
}
sub formatExpand {
local($format, %fmt) = @_;
local($str) = $format;
$str =~ s/\%\%/\377/g;
for (keys %fmt) {
$str =~ s/\%$_/$fmt{$_}/g;
}
$str =~ s/\377/%/g;
$str;
}
## This function takes arguments in standard ";
if (! $graphic_relevance_mode) {
$ballsrc = "-";
$ballsrc = "+" if ($colors{$num} == 1);
}
$qbeimg = $ballsrc;
##calculate summary and QBE URLs
if ($query_by_example_mode) {
$spage = "searchpage=$form{'searchpage'}"
if $form{'searchpage'};
$spage=&httpize($spage);
$qbe = "$qbeimg";
}
$qbe = $qbeimg unless $qbe;
$titles{$num} =~ s/^\s+//g;
$titles{$num} =~ s/\s+$//g;
if ($summary_mode) {
if ($summary_link_mode) {
$sum_title_info = "&stitle=$titles{$num}&surl=$ourl";
$sum_title_info = &httpize($sum_title_info);
}
$sum = "$sumimg";
}
$summaries{$num} = "" unless $inline_summaries;
if (! $summaries{$num}) {
$result_line = &formatExpand($format_gentry,
'g', $groupnum,
'n', $num,
'w', $groupwords{$g},
't', $titles{$num},
'u', $url,
'b', $qbe,
's', $sum);
} else {
$result_line = &formatExpand($format_gentry_special,
'g', $groupnum,
'n', $num,
'w', $groupwords{$g},
't', $titles{$num},
'u', $url,
'b', $qbe,
's', $summaries{$num});
}
if ($customize_result_list) {
$qbe = "(empty)" unless $qbe;
$sum = "(empty)" unless $sum;
$sum = $summaries{$num} if $summaries{$num};
$result_line = &customize_grouping_line($form{'db'}, $path,
$form{'hroot'},
$qbe,
$titles{$num},
$sum,
$result_line);
}
##make sure URLs all have forward slashes
$result_line =~ s/\\/\//g;
print $result_line;
}
print &formatExpand($format_gclose,
'g', $groupnum,
'w', $groupwords{$g});
$groupnum++;
}
## check if this was a good group of articles to group, if not
## advise the user.
if (! &goodGrouping($totalarts, %groupcount)) {
print <
No documents found.') || ($errstr =~ /No documents found/)); print "
No documents found.
" if ($errstr =~ /No documents found/);
print $errstr unless ($errstr =~ /No documents found/) ;
exit(0) unless ($errstr =~ /No documents found/);
## VL - added this to return null if everything is OK
## may be OK to remove exit(0) as well
return "";
}
## tells the user what his query was on the result page, if
## appropriate.
sub showSearchString {
local(%sform) = @_;
local($search);
$search = $sform{'psearch'};
$search = "(Query-By-Example)" if $sform{'doc'};
##$search =~ s|\W| |g;
print
"\n
excite for web servers found documents about: $search\n"
unless (($sform{'sum'}) || ($sform{'dump'}));
&callbackButton($sform{'aurl'}, $sform{'searchpage'});
&explainResults($sform{'aurl'});
print "
";
}
## ECO -- explains the icons on the results list
sub explainResults {
local($aurl) = @_;
local($qbe, $summary, $describe);
$qbe = "; click icons to find similar documents" if ($query_by_example_mode && $graphic_relevance_mode);
$qbe = "; click + or - to find similar documents" if ($query_by_example_mode && !$graphic_relevance_mode);
$summary = "; click (summary) for a short summary of the document"
if ($summary_mode && !$inline_summaries);
if ($show_legend) {
print <
- higher confidence,
- lower confidence$qbe$summary
EOF
;
}
}
##ECO -- returns 'confidence' or 'subject' based on value of
##grouped by radiobuttons or original search form
sub getSearchMode {
local(%form) = @_;
local($xcoord);
return 'confidence' unless $form{'groupby.x'};
$xcoord = $form{'groupby.x'};
$xcoord =~ s/\D//g;
return 'subject' if (($xcoord > 225) && ($form{'smode'} eq 'confidence'));
return 'subject' if (($xcoord > 194) && ($form{'smode'} eq 'subject'));
return 'confidence';
}
## ECO -- shows the appropriate gif based on subject or confidence grouping
sub showSearchMode {
local($mode, %form) = @_;
local($aurl, $orig, $searchpage, $docs, $qbe);
local($image);
$aurl = $form{'aurl'};
$orig = $form{'search'};
$searchpage = $form{'searchpage'};
$docs = $form{'docs'};
$qbe = $form{'doc'};
return unless $subject_group_mode;
$image = $confidence_image if ($mode eq 'confidence');
$image = $subject_image if ($mode eq 'subject');
$qbe = "" if $qbe;
print <
);
}
}
##this routine allows the query script to generate appropriate search
##strings for concept queries, query-by-example, and summaries.
sub setSearchString {
local(%sform) = @_;
local($search, $docnum);
if ($sform{'doc'}) {
$docnum = $sform{'doc'};
$docnum =~ s|\D||g;
$search = "(concept (doc r=$docnum))"; }
elsif ($sform{'sum'}) {
$docnum = $sform{'sum'};
$docnum =~ s|\D||g;
$search = "(sum sroot=$sform{'root'}/collections/summary n=$number_of_summary_sentences sep=~~~ $docnum)"; }
elsif ($sform{'dump'}) {
$docnum = $sform{'dump'};
$docnum =~ s|\D||g;
$search = "(dump title=0 hdr=1 $docnum)";
}
else {
$search = $sform{'search'};
}
## this mode means we should return a boolean search string
if ($sform{'mode'} eq 'simple') {
$search =~ s|\W| |g;
$search =~ s|^\s+||g;
$search =~ s|\s+$||g;
$search =~ s|\s+| |g;
@words = split(/\s/, $search);
if ($#words <= 0) {
$search = "(s @words)";
}
else {
$search = "(c @words (. (& ";
for $word (@words) {
$search .= "(sb ifstem=all $word) ";
}
$search .= ")))";
}
} else {
## checks for stemming by default mode
if ($stem_by_default && ($search !~ /^\(/)) {
$search = join('$ ', split(/\s+/, $search));
$search .= "\$";
$search =~ s/AND\$/AND/g;
$search =~ s/OR\$/OR/g;
$search =~ s/NOT\$/NOT/g;
}
}
$search =~ s|\s+| |g;
return $search;
}
## make any changes to the anchor that are necessary. i.e, add
## shttp attributes or something like that.
sub modifyAnchor {
local($url) = @_;
return ("");
}
sub addDumpOperator {
local($file, $docnum, $db, $htmlroot) = @_;
local($suffix) = $script_suffix;
if (&underRoot($htmlroot, $file)) {
return $file;
} else {
return "AT-${db}search$suffix?dump=d$docnum";
}
}
## returns true if a $file is beneath $root directory
sub underRoot {
local($root, $file) = @_;
local($prefix);
$root =~ s|\/$|| if ($root =~ /\/$/);
$prefix = substr($file, 0, length($root));
return 1 if ($prefix eq $root);
return 0;
}
sub footer {
local($aurl) = @_;
if ($powered_by_excite) {
## NOTE: according to the license agreement, you are not to
## remove this link if you are using a free version of EWS unless
## you have purchased a maintenance agreement.
print <
\n);
$form{'num'} = 9999999; ## list everything
$form{'scorefun'} = "nothing";
$form{'urledit'} = $urledit;
## these don't matter
$qbeimg = "Q";
$sumimg = "(summary)";
## open map and index data files
$croot = $attr{'CollectionRoot'};
if (!open(IDX, ">$croot.url.idx")) {
print "couldn't open auxiliary file for notification";
exit -1;
}
if (!open(MAP, ">$croot.url.map")) {
print "couldn't open auxiliary file for notification";
exit -1;
}
## call the search executable with -dump flag to get a list of all
## documents in a form that can be passed to processResultLine
$searchExecutable = $attr{'SearchExecutable'};
## convert for ports
$searchExecutable = &main'convert_file_names($searchExecutable);
open(DUMP, "$searchExecutable -R $root/collections/$db -dump|");
while (