Commit 3e2850a5 authored by Fred Drake's avatar Fred Drake

New conversion tools for HTML->info from Michael Ernst

<mernst@cs.washington.edu>.

Thanks!
parent 94131d39
#!/uns/bin/perl
package checkargs;
require 5.004; # uses "for my $var"
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(check_args check_args_range check_args_at_least);
use strict;
use Carp;
=head1 NAME
checkargs -- Provide rudimentary argument checking for perl5 functions
=head1 SYNOPSIS
check_args(cArgsExpected, @_)
check_args_range(cArgsMin, cArgsMax, @_)
check_args_at_least(cArgsMin, @_)
where "@_" should be supplied literally.
=head1 DESCRIPTION
As the first line of user-written subroutine foo, do one of the following:
my ($arg1, $arg2) = check_args(2, @_);
my ($arg1, @rest) = check_args_range(1, 4, @_);
my ($arg1, @rest) = check_args_at_least(1, @_);
my @args = check_args_at_least(0, @_);
These functions may also be called for side effect (put a call to one
of the functions near the beginning of the subroutine), but using the
argument checkers to set the argument list is the recommended usage.
The number of arguments and their definedness are checked; if the wrong
number are received, the program exits with an error message.
=head1 AUTHOR
Michael D. Ernst <F<mernst@cs.washington.edu>>
=cut
## Need to check that use of caller(1) really gives desired results.
## Need to give input chunk information.
## Is this obviated by Perl 5.003's declarations? Not entirely, I think.
sub check_args ( $@ )
{
my ($num_formals, @args) = @_;
my ($pack, $file_arg, $line_arg, $subname, $hasargs, $wantarr) = caller(1);
if (@_ < 1) { croak "check_args needs at least 7 args, got ", scalar(@_), ": @_\n "; }
if ((!wantarray) && ($num_formals != 0))
{ croak "check_args called in scalar context"; }
# Can't use croak below here: it would only go out to caller, not its caller
my $num_actuals = @args;
if ($num_actuals != $num_formals)
{ die "$file_arg:$line_arg: function $subname expected $num_formals argument",
(($num_formals == 1) ? "" : "s"),
", got $num_actuals",
(($num_actuals == 0) ? "" : ": @args"),
"\n"; }
for my $index (0..$#args)
{ if (!defined($args[$index]))
{ die "$file_arg:$line_arg: function $subname undefined argument ", $index+1, ": @args[0..$index-1]\n"; } }
return @args;
}
sub check_args_range ( $$@ )
{
my ($min_formals, $max_formals, @args) = @_;
my ($pack, $file_arg, $line_arg, $subname, $hasargs, $wantarr) = caller(1);
if (@_ < 2) { croak "check_args_range needs at least 8 args, got ", scalar(@_), ": @_"; }
if ((!wantarray) && ($max_formals != 0) && ($min_formals !=0) )
{ croak "check_args_range called in scalar context"; }
# Can't use croak below here: it would only go out to caller, not its caller
my $num_actuals = @args;
if (($num_actuals < $min_formals) || ($num_actuals > $max_formals))
{ die "$file_arg:$line_arg: function $subname expected $min_formals-$max_formals arguments, got $num_actuals",
($num_actuals == 0) ? "" : ": @args", "\n"; }
for my $index (0..$#args)
{ if (!defined($args[$index]))
{ die "$file_arg:$line_arg: function $subname undefined argument ", $index+1, ": @args[0..$index-1]\n"; } }
return @args;
}
sub check_args_at_least ( $@ )
{
my ($min_formals, @args) = @_;
my ($pack, $file_arg, $line_arg, $subname, $hasargs, $wantarr) = caller(1);
# Don't do this, because we want every sub to start with a call to check_args*
# if ($min_formals == 0)
# { die "Isn't it pointless to check for at least zero args to $subname?\n"; }
if (scalar(@_) < 1)
{ croak "check_args_at_least needs at least 1 arg, got ", scalar(@_), ": @_"; }
if ((!wantarray) && ($min_formals != 0))
{ croak "check_args_at_least called in scalar context"; }
# Can't use croak below here: it would only go out to caller, not its caller
my $num_actuals = @args;
if ($num_actuals < $min_formals)
{ die "$file_arg:$line_arg: function $subname expected at least $min_formals argument",
($min_formals == 1) ? "" : "s",
", got $num_actuals",
($num_actuals == 0) ? "" : ": @args", "\n"; }
for my $index (0..$#args)
{ if (!defined($args[$index]))
{ warn "$file_arg:$line_arg: function $subname undefined argument ", $index+1, ": @args[0..$index-1]\n"; last; } }
return @args;
}
1; # successful import
__END__
#! /usr/bin/env perl -w
# html2texi.pl -- Convert HTML documentation to Texinfo format
# Michael Ernst <mernst@cs.washington.edu>
# Time-stamp: <1998-09-10 12:52:38 mernst>
# This program converts HTML documentation trees into Texinfo format.
# Given the name of a main (or contents) HTML file, it processes that file,
# and other files (transitively) referenced by it, into a Texinfo file
# (whose name is chosen from the file or directory name of the argument).
# For instance:
# html2texi.pl api/index.pl
# produces file "api.texi".
# Texinfo format can be easily converted to Info format (for browsing in
# Emacs or the standalone Info browser), to a printed manual, or to HTML.
# Thus, html2texi.pl permits conversion of HTML files to Info format, and
# secondarily enables producing printed versions of Web page hierarchies.
# Unlike HTML, Info format is searchable. Since Info is integrated into
# Emacs, one can read documentation without starting a separate Web
# browser. Additionally, Info browsers (including Emacs) contain
# convenient features missing from Web browsers, such as easy index lookup
# and mouse-free browsing.
# Limitations:
# html2texi.pl is currently tuned to latex2html output, but should be
# extensible to arbitrary HTML documents. It will be most useful for HTML
# with a hierarchical structure and an index. The HTML tree to be
# traversed must be on local disk, rather than being accessed via HTTP.
# This script requires the use of "checkargs.pm". To eliminate that
# dependence, replace calls to check_args* by @_ (which is always the last
# argument to those functions).
# Also see the "to do" section, below.
# Comments, suggestions, bug fixes, and enhancements are welcome.
###
### Typical usage for the Python documentation:
###
# (Actually, most of this is in a Makefile instead.)
# The resulting Info format Python documentation is currently available at
# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz
# Fix up HTML problems, eg <DL COMPACT><DD>
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html
# Edit the generated .texi files:
# * change @setfilename to prefix "python-"
# * fix up any sectioning, such as for Abstract
# * make Texinfo menus
# * perhaps remove the @detailmenu ... @end detailmenu
# In Emacs:
# (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))
# makeinfo api.texi
# makeinfo ext.texi
# makeinfo lib.texi
# makeinfo mac.texi
# makeinfo ref.texi
# makeinfo tut.texi
###
### Structure of the code
###
# To be written...
###
### Design decisions
###
# Source and destination languages
# --------------------------------
#
# The goal is Info files; I create Texinfo, so I don't have to worry about
# the finer details of Info file creation. (I'm not even sure of its exact
# format.)
#
# Why not start from LaTeX rather than HTML?
# I could hack latex2html itself to produce Texinfo instead, or fix up
# partparse.py (which already translates LaTeX to Teinfo).
# Pros:
# * has high-level information such as index entries, original formatting
# Cons:
# * those programs are complicated to read and understand
# * those programs try to handle arbitrary LaTeX input, track catcodes,
# and more: I don't want to go to that effort. HTML isn't as powerful
# as LaTeX, so there are fewer subtleties.
# * the result wouldn't work for arbitrary HTML documents; it would be
# nice to eventually extend this program to HTML produced from Docbook,
# Frame, and more.
# Parsing
# -------
#
# I don't want to view the text as a linear stream; I'd rather parse the
# whole thing and then do pattern matching over the parsed representation (to
# find idioms such as indices, lists of child nodes, etc.).
# * Perl provides HTML::TreeBuilder, which does just what I want.
# * libwww-perl: http://www.linpro.no/lwp/
# * TreeBuilder: HTML-Tree-0.51.tar.gz
# * Python Parsers, Formatters, and Writers don't really provide the right
# interface (and the version in Grail doesn't correspond to another
# distributed version, so I'm confused about which to be using). I could
# write something in Python that creates a parse tree, but why bother?
# Other implementation language issues:
# * Python lacks variable declarations, reasonable scoping, and static
# checking tools. I've written some of the latter for myself that make
# my Perl programming a lot safer than my Python programming will be until
# I have a similar suite for that language.
###########################################################################
### To do
###
# Section names:
# Fix the problem with multiple sections in a single file (eg, Abstract in
# Front Matter section).
# Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310
# Index:
# Perhaps double-check that every tag mentioned in the index is found
# in the text.
# Python: email to python-docs@python.org, to get their feedback.
# Compare to existing lib/ Info manual
# Write the hooks into info-look; replace pyliblookup1-1.tar.gz.
# Postpass to remove extra quotation marks around typography already in
# a different font (to avoid double delimiters as in "`code'"); or
# perhaps consider using only font-based markup so that we don't get
# the extra *bold* and `code' markup in Info.
## Perhaps don't rely on automatic means for adding up, next, prev; I have
## all that info available to me already, so it's not so much trouble to
## add it. (Right?) But it is *so* easy to use Emacs instead...
###########################################################################
### Strictures
###
# man HTML::TreeBuilder
# man HTML::Parser
# man HTML::Element
# require HTML::ParserWComment;
require HTML::Parser;
require HTML::TreeBuilder;
require HTML::Element;
use File::Basename;
use Cwd;
use strict;
# use Carp;
use checkargs;
###########################################################################
### Variables
###
my @section_stack = (); # elements are chapter/section/subsec nodetitles (I think)
my $current_ref_tdf; # for the file currently being processed;
# used in error messages
my $html_directory;
my %footnotes;
# First element should not be used.
my @sectionmarker = ("manual", "chapter", "section", "subsection", "subsubsection");
my %inline_markup = ("b" => "strong",
"code" => "code",
"i" => "emph",
"kbd" => "kbd",
"samp" => "samp",
"strong" => "strong",
"tt" => "code",
"var" => "var");
my @deferred_index_entries = ();
my @index_titles = (); # list of (filename, type) lists
my %index_info = ("Index" => ["\@blindex", "bl"],
"Concept Index" => ["\@cindex", "cp"],
"Module Index" => ["\@mdindex", "md"]);
###########################################################################
### Main/contents page
###
# Process first-level page on its own, or just a contents page? Well, I do
# want the title, author, etc., and the front matter... For now, just add
# that by hand at the end.
# data structure possibilities:
# * tree-like (need some kind of stack when processing (or parent pointers))
# * list of name and depth; remember old and new depths.
# Each element is a reference to a list of (nodetitle, depth, filename).
my @contents_list = ();
# The problem with doing fixups on the fly is that some sections may have
# already been processed (and no longer available) by the time we notice
# others with the same name. It's probably better to fully construct the
# contents list (reading in all files of interest) upfront; that will also
# let me do a better job with cross-references, because again, all files
# will already be read in.
my %contents_hash = ();
my %contents_fixups = ();
my @current_contents_list = ();
# Merge @current_contents_list into @contents_list,
# and set @current_contents_list to be empty.
sub merge_contents_lists ( )
{ check_args(0, @_);
# Three possibilities:
# * @contents_list is empty: replace it by @current_contents_list.
# * prefixes of the two lists are identical: do nothing
# * @current_contents_list is all at lower level than $contents_list[0];
# prefix @contents_list by @current_contents_list
if (scalar(@current_contents_list) == 0)
{ die "empty current_contents_list"; }
# if (scalar(@contents_list) == 0)
# { @contents_list = @current_contents_list;
# @current_contents_list = ();
# return; }
# if (($ {$contents_list[0]}[1]) < ($ {$current_contents_list[0]}[1]))
# { unshift @contents_list, @current_contents_list;
# @current_contents_list = ();
# return; }
for (my $i=0; $i<scalar(@current_contents_list); $i++)
{ my $ref_c_tdf = $current_contents_list[$i];
if ($i >= scalar(@contents_list))
{ push @contents_list, $ref_c_tdf;
my $title = $ {$ref_c_tdf}[0];
if (defined $contents_hash{$title})
{ $contents_fixups{$title} = 1; }
else
{ $contents_hash{$title} = 1; }
next; }
my $ref_tdf = $contents_list[$i];
my ($title, $depth, $file) = @{$ref_tdf};
my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf};
if (($title ne $c_title)
&& ($depth < $c_depth)
&& ($file ne $c_file))
{ splice @contents_list, $i, 0, $ref_c_tdf;
if (defined $contents_hash{$c_title})
{ $contents_fixups{$c_title} = 1; }
else
{ $contents_hash{$c_title} = 1; }
next; }
if (($title ne $c_title)
|| ($depth != $c_depth)
|| ($file ne $c_file))
{ die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:",
"\n main: <<<$title>>> $depth $file",
"\n curr: <<<$c_title>>> $c_depth $c_file"); }
}
@current_contents_list = ();
}
# Set @current_contents_list to a list of (title, href, sectionlevel);
# then merge that list into @contents_list.
# Maybe this function should also produce a map
# from title (or href) to sectionlevel (eg "chapter"?).
sub process_child_links ( $ )
{ my ($he) = check_args(1, @_);
# $he->dump;
if (scalar(@current_contents_list) != 0)
{ die "current_contents_list nonempty: @current_contents_list"; }
$he->traverse(\&increment_current_contents_list, 'ignore text');
# Normalize the depths; for instance, convert 1,3,5 into 0,1,2.
my %depths = ();
for my $ref_tdf (@current_contents_list)
{ $depths{$ {$ref_tdf}[1]} = 1; }
my @sorted_depths = sort keys %depths;
my $current_depth = scalar(@section_stack)-1;
my $current_depth_2 = $ {$current_ref_tdf}[1];
if ($current_depth != $current_depth_2)
{ die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack); }
for (my $i=0; $i<scalar(@sorted_depths); $i++)
{ $depths{$sorted_depths[$i]} = $i + $current_depth+1; }
for my $ref_tdf (@current_contents_list)
{ $ {$ref_tdf}[1] = $depths{$ {$ref_tdf}[1]}; }
# Eliminate uninteresting sections. Hard-coded hack for now.
if ($ {$current_contents_list[-1]}[0] eq "About this document ...")
{ pop @current_contents_list; }
if ((scalar(@current_contents_list) > 1)
&& ($ {$current_contents_list[1]}[0] eq "Contents"))
{ my $ref_first_tdf = shift @current_contents_list;
$current_contents_list[0] = $ref_first_tdf; }
for (my $i=0; $i<scalar(@current_contents_list); $i++)
{ my $ref_tdf = $current_contents_list[$i];
my $title = $ {$ref_tdf}[0];
if (exists $index_info{$title})
{ my $index_file = $ {$ref_tdf}[2];
my ($indexing_command, $suffix) = @{$index_info{$title}};
process_index_file($index_file, $indexing_command);
print TEXI "\n\@defindex $suffix\n";
push @index_titles, $title;
splice @current_contents_list, $i, 1;
$i--; }
elsif ($title =~ /\bIndex$/)
{ print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n"; } }
merge_contents_lists();
# print_contents_list();
# print_index_info();
}
sub increment_current_contents_list ( $$$ )
{ my ($he, $startflag, $depth) = check_args(3, @_);
if (!$startflag)
{ return; }
if ($he->tag eq "li")
{ my @li_content = @{$he->content};
if ($li_content[0]->tag ne "a")
{ die "first element of <LI> should be <A>"; }
my ($name, $href, @content) = anchor_info($li_content[0]);
# unused $name
my $title = join("", collect_texts($li_content[0]));
$title = texi_remove_punctuation($title);
# The problem with these is that they are formatted differently in
# @menu and @node!
$title =~ s/``/\"/g;
$title =~ s/''/\"/g;
$title =~ s/ -- / /g;
push @current_contents_list, [ $title, $depth, $href ]; }
return 1;
}
# Simple version for section titles
sub html_to_texi ( $ )
{ my ($he) = check_args(1, @_);
if (!ref $he)
{ return $he; }
my $tag = $he->tag;
if (exists $inline_markup{$tag})
{ my $result = "\@$inline_markup{$tag}\{";
for my $elt (@{$he->content})
{ $result .= html_to_texi($elt); }
$result .= "\}";
return $result; }
else
{ $he->dump;
die "html_to_texi confused by <$tag>"; }
}
sub print_contents_list ()
{ check_args(0, @_);
print STDERR "Contents list:\n";
for my $ref_tdf (@contents_list)
{ my ($title, $depth, $file) = @{$ref_tdf};
print STDERR "$title $depth $file\n"; }
}
###########################################################################
### Index
###
my $l2h_broken_link_name = "l2h-";
# map from file to (map from anchor name to (list of index texts))
# (The list is needed when a single LaTeX command like \envvar
# expands to multiple \index commands.)
my %file_index_entries = ();
my %this_index_entries; # map from anchor name to (list of index texts)
my %file_index_entries_broken = (); # map from file to (list of index texts)
my @this_index_entries_broken;
my $index_prefix = "";
my @index_prefixes = ();
my $this_indexing_command;
sub print_index_info ()
{ check_args(0, @_);
my ($key, $val);
for my $file (sort keys %file_index_entries)
{ my %index_entries = %{$file_index_entries{$file}};
print STDERR "file: $file\n";
for my $aname (sort keys %index_entries)
{ my @entries = @{$index_entries{$aname}};
if (scalar(@entries) == 1)
{ print STDERR " $aname : $entries[0]\n"; }
else
{ print STDERR " $aname : ", join("\n " . (" " x length($aname)), @entries), "\n"; } } }
for my $file (sort keys %file_index_entries_broken)
{ my @entries = @{$file_index_entries_broken{$file}};
print STDERR "file: $file\n";
for my $entry (@entries)
{ print STDERR " $entry\n"; }
}
}
sub process_index_file ( $$ )
{ my ($file, $indexing_command) = check_args(2, @_);
# print "process_index_file $file $indexing_command\n";
my $he = file_to_tree($html_directory . $file);
# $he->dump();
$this_indexing_command = $indexing_command;
$he->traverse(\&process_if_index_dl_compact, 'ignore text');
undef $this_indexing_command;
# print "process_index_file done\n";
}
sub process_if_index_dl_compact ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
if (!$startflag)
{ return; }
if (($he->tag() eq "dl") && (defined $he->attr('compact')))
{ process_index_dl_compact($he);
return 0; }
else
{ return 1; }
}
# The elements of a <DL COMPACT> list from a LaTeX2HTML index:
# * a single space: text to be ignored
# * <DT> elements with an optional <DD> element following each one
# Two types of <DT> elements:
# * Followed by a <DD> element: the <DT> contains a single
# string, and the <DD> contains a whitespace string to be ignored, a
# <DL COMPACT> to be recursively processed (with the <DT> string as a
# prefix), and a whitespace string to be ignored.
# * Not followed by a <DD> element: contains a list of anchors
# and texts (ignore the texts, which are only whitespace and commas).
# Optionally contains a <DL COMPACT> to be recursively processed (with
# the <DT> string as a prefix)
sub process_index_dl_compact ( $ )
{ my ($h) = check_args(1, @_);
my @content = @{$h->content()};
for (my $i = 0; $i < scalar(@content); $i++)
{ my $this_he = $content[$i];
if ($this_he->tag ne "dt")
{ $this_he->dump;
die "Expected <DT> tag: " . $this_he->tag; }
if (($i < scalar(@content) - 1) && ($content[$i+1]->tag eq "dd"))
{ process_index_dt_and_dd($this_he, $content[$i+1]);
$i++; }
else
{ process_index_lone_dt($this_he); } } }
# Argument is a <DT> element. If it contains more than one anchor, then
# the texts of all subsequent ones are "[Link]". Example:
# <DT>
# <A HREF="embedding.html#l2h-201">
# "$PATH"
# ", "
# <A HREF="embedding.html#l2h-205">
# "[Link]"
# Optionally contains a <DL COMPACT> as well. Example:
# <DT>
# <A HREF="types.html#l2h-616">
# "attribute"
# <DL COMPACT>
# <DT>
# <A HREF="assignment.html#l2h-3074">
# "assignment"
# ", "
# <A HREF="assignment.html#l2h-3099">
# "[Link]"
# <DT>
# <A HREF="types.html#l2h-">
# "assignment, class"
sub process_index_lone_dt ( $ )
{ my ($dt) = check_args(1, @_);
my @dtcontent = @{$dt->content()};
my $acontent;
my $acontent_suffix;
for my $a (@dtcontent)
{ if ($a eq ", ")
{ next; }
if (!ref $a)
{ $dt->dump;
die "Unexpected <DT> string element: $a"; }
if ($a->tag eq "dl")
{ push @index_prefixes, $index_prefix;
if (!defined $acontent_suffix)
{ die "acontent_suffix not yet defined"; }
$index_prefix .= $acontent_suffix . ", ";
process_index_dl_compact($a);
$index_prefix = pop(@index_prefixes);
return; }
if ($a->tag ne "a")
{ $dt->dump;
$a->dump;
die "Expected anchor in lone <DT>"; }
my ($aname, $ahref, @acontent) = anchor_info($a);
# unused $aname
if (scalar(@acontent) != 1)
{ die "Expected just one content of <A> in <DT>: @acontent"; }
if (ref $acontent[0])
{ $acontent[0]->dump;
die "Expected string content of <A> in <DT>: $acontent[0]"; }
if (!defined($acontent))
{ $acontent = $index_prefix . $acontent[0];
$acontent_suffix = $acontent[0]; }
elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0])))
{ die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>"; }
if (!defined $ahref)
{ $dt->dump;
die "no HREF in nachor in <DT>"; }
my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
if (!defined $ahref_name)
{ # Reference to entire file
$ahref_name = ""; }
if ($ahref_name eq $l2h_broken_link_name)
{ if (!exists $file_index_entries_broken{$ahref_file})
{ $file_index_entries_broken{$ahref_file} = []; }
push @{$file_index_entries_broken{$ahref_file}}, "$this_indexing_command $acontent";
next; }
if (!exists $file_index_entries{$ahref_file})
{ $file_index_entries{$ahref_file} = {}; }
# Don't do this! It appears to make a copy, which is not desired.
# my %index_entries = %{$file_index_entries{$ahref_file}};
if (!exists $ {$file_index_entries{$ahref_file}}{$ahref_name})
{ $ {$file_index_entries{$ahref_file}}{$ahref_name} = []; }
# { my $oldcontent = $ {$file_index_entries{$ahref_file}}{$ahref_name};
# if ($acontent eq $oldcontent)
# { die "Multiple identical index entries?"; }
# die "Trying to add $acontent, but already have index entry pointing at $ahref_file\#$ahref_name: ${$file_index_entries{$ahref_file}}{$ahref_name}"; }
push @{$ {$file_index_entries{$ahref_file}}{$ahref_name}}, "$this_indexing_command $acontent";
# print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
}
}
sub process_index_dt_and_dd ( $$ )
{ my ($dt, $dd) = check_args(2, @_);
my $dtcontent;
{ my @dtcontent = @{$dt->content()};
if ((scalar(@dtcontent) != 1) || (ref $dtcontent[0]))
{ $dd->dump;
$dt->dump;
die "Expected single string (actual size = " . scalar(@dtcontent) . ") in content of <DT>: @dtcontent"; }
$dtcontent = $dtcontent[0];
$dtcontent =~ s/ +$//; }
my $ddcontent;
{ my @ddcontent = @{$dd->content()};
if (scalar(@ddcontent) != 1)
{ die "Expected single <DD> content, got ", scalar(@ddcontent), " elements:\n", join("\n", @ddcontent), "\n "; }
$ddcontent = $ddcontent[0]; }
if ($ddcontent->tag ne "dl")
{ die "Expected <DL> as content of <DD>, but saw: $ddcontent"; }
push @index_prefixes, $index_prefix;
$index_prefix .= $dtcontent . ", ";
process_index_dl_compact($ddcontent);
$index_prefix = pop(@index_prefixes);
}
###########################################################################
### Ordinary sections
###
sub process_section_file ( $$$ )
{ my ($file, $depth, $nodetitle) = check_args(3, @_);
my $he = file_to_tree(($file =~ /^\//) ? $file : $html_directory . $file);
# print STDERR "process_section_file: $file $depth $nodetitle\n";
# Equivalently:
# while ($depth >= scalar(@section_stack)) { pop(@section_stack); }
@section_stack = @section_stack[0..$depth-1];
# Not a great nodename fixup scheme; need a more global view
if ((defined $contents_fixups{$nodetitle})
&& (scalar(@section_stack) > 0))
{ my $up_title = $section_stack[$#section_stack];
# hack for Python Standard Library
$up_title =~ s/^(Built-in|Standard) Module //g;
my ($up_first_word) = split(/ /, $up_title);
$nodetitle = "$up_first_word $nodetitle";
}
push @section_stack, $nodetitle;
# print STDERR "new section_stack: ", join(", ", @section_stack), "\n";
$he->traverse(\&process_if_child_links, 'ignore text');
%footnotes = ();
# $he->dump;
$he->traverse(\&process_if_footnotes, 'ignore text');
# $he->dump;
if (exists $file_index_entries{$file})
{ %this_index_entries = %{$file_index_entries{$file}};
# print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n";
}
else
{ # print STDERR "Warning: no index entries for file $file\n";
%this_index_entries = (); }
if (exists $file_index_entries_broken{$file})
{ @this_index_entries_broken = @{$file_index_entries_broken{$file}}; }
else
{ # print STDERR "Warning: no index entries for file $file\n";
@this_index_entries_broken = (); }
if ($he->tag() ne "html")
{ die "Expected <HTML> at top level"; }
my @content = @{$he->content()};
if ((!ref $content[0]) or ($content[0]->tag ne "head"))
{ $he->dump;
die "<HEAD> not first element of <HTML>"; }
if ((!ref $content[1]) or ($content[1]->tag ne "body"))
{ $he->dump;
die "<BODY> not second element of <HTML>"; }
$content[1]->traverse(\&output_body);
}
# stack of things we're inside that are preventing indexing from occurring now.
# These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?)
my @index_deferrers = ();
sub push_or_pop_index_deferrers ( $$ )
{ my ($tag, $startflag) = check_args(2, @_);
if ($startflag)
{ push @index_deferrers, $tag; }
else
{ my $old_deferrer = pop @index_deferrers;
if ($tag ne $old_deferrer)
{ die "Expected $tag at top of index_deferrers but saw $old_deferrer; remainder = ", join(" ", @index_deferrers); }
do_deferred_index_entries(); }
}
sub label_add_index_entries ( $;$ )
{ my ($label, $he) = check_args_range(1, 2, @_);
# print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";
# $he is the anchor element
if (exists $this_index_entries{$label})
{ push @deferred_index_entries, @{$this_index_entries{$label}};
return; }
if ($label eq $l2h_broken_link_name)
{ # Try to find some text to use in guessing which links should point here
# I should probably only look at the previous element, or if that is
# all punctuation, the one before it; collecting all the previous texts
# is a bit of overkill.
my @anchor_texts = collect_texts($he);
my @previous_texts = collect_texts($he->parent, $he);
# 4 elements is arbitrary; ought to filter out punctuation and small words
# first, then perhaps keep fewer. Perhaps also filter out formatting so
# that we can see a larger chunk of text? (Probably not.)
# Also perhaps should do further chunking into words, in case the
# index term isn't a chunk of its own (eg, was in <tt>...</tt>.
my @candidate_texts = (@anchor_texts, (reverse(@previous_texts))[0..min(3,$#previous_texts)]);
my $guessed = 0;
for my $text (@candidate_texts)
{ # my $orig_text = $text;
if ($text =~ /^[\"\`\'().?! ]*$/)
{ next; }
if (length($text) <= 2)
{ next; }
# hack for Python manual; maybe defer until failure first time around?
$text =~ s/^sys\.//g;
for my $iterm (@this_index_entries_broken)
{ # I could test for zero: LaTeX2HTML's failures in the Python
# documentation are only for items of the form "... (built-in...)"
if (index($iterm, $text) != -1)
{ push @deferred_index_entries, $iterm;
# print STDERR "Guessing index term `$iterm' for text `$orig_text'\n";
$guessed = 1;
} } }
if (!$guessed)
{ # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n";
}
}
}
# Need to add calls to this at various places.
# Perhaps add HTML::Element argument and do the check for appropriateness
# here (ie, no action if inside <H1>, etc.).
sub do_deferred_index_entries ()
{ check_args(0, @_);
if ((scalar(@deferred_index_entries) > 0)
&& (scalar(@index_deferrers) == 0))
{ print TEXI "\n", join("\n", @deferred_index_entries), "\n";
@deferred_index_entries = (); }
}
my $table_columns; # undefined if not in a table
my $table_first_column; # boolean
sub output_body ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
if (!ref $he)
{ my $space_index = index($he, " ");
if ($space_index != -1)
{ # Why does
# print TEXI texi_quote(substr($he, 0, $space_index+1));
# give: Can't locate object method "TEXI" via package "texi_quote"
# (Because the definition texi_quote hasn't been seen yet.)
print TEXI &texi_quote(substr($he, 0, $space_index+1));
do_deferred_index_entries();
print TEXI &texi_quote(substr($he, $space_index+1)); }
else
{ print TEXI &texi_quote($he); }
return; }
my $tag = $he->tag();
# Ordinary text markup first
if (exists $inline_markup{$tag})
{ if ($startflag)
{ print TEXI "\@$inline_markup{$tag}\{"; }
else
{ print TEXI "\}"; } }
elsif ($tag eq "a")
{ my ($name, $href, @content) = anchor_info($he);
if (!$href)
{ # This anchor is only here for indexing/cross referencing purposes.
if ($startflag)
{ label_add_index_entries($name, $he); }
}
elsif ($href =~ "^(ftp|http|news):")
{ if ($startflag)
{ # Should avoid second argument if it's identical to the URL.
print TEXI "\@uref\{$href, "; }
else
{ print TEXI "\}"; }
}
elsif ($href =~ /^\#(foot[0-9]+)$/)
{ # Footnote
if ($startflag)
{ # Could double-check name and content, but I'm not
# currently storing that information.
print TEXI "\@footnote\{";
$footnotes{$1}->traverse(\&output_body);
print TEXI "\}";
return 0; } }
else
{ if ($startflag)
{ $he->dump;
warn "Can't deal with internal HREF anchors yet"; }
}
}
elsif ($tag eq "br")
{ print TEXI "\@\n"; }
elsif ($tag eq "body")
{ }
elsif ($tag eq "center")
{ if (has_single_content_string($he)
&& ($ {$he->content}[0] =~ /^ *$/))
{ return 0; }
if ($startflag)
{ print TEXI "\n\@center\n"; }
else
{ print TEXI "\n\@end center\n"; }
}
elsif ($tag eq "div")
{ my $align = $he->attr('align');
if (defined($align) && ($align eq "center"))
{ if (has_single_content_string($he)
&& ($ {$he->content}[0] =~ /^ *$/))
{ return 0; }
if ($startflag)
{ print TEXI "\n\@center\n"; }
else
{ print TEXI "\n\@end center\n"; } }
}
elsif ($tag eq "dl")
{ # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
if (has_single_content_with_tag($he, "dd"))
{ my $he_dd = $ {$he->content}[0];
if (has_single_content_with_tag($he_dd, "pre"))
{ my $he_pre = $ {$he_dd->content}[0];
print_pre($he_pre);
return 0; } }
if ($startflag)
{ # Could examine the elements, to be cleverer about formatting.
# (Also to use ftable, vtable...)
print TEXI "\n\@table \@asis\n"; }
else
{ print TEXI "\n\@end table\n"; }
}
elsif ($tag eq "dt")
{ push_or_pop_index_deferrers($tag, $startflag);
if ($startflag)
{ print TEXI "\n\@item "; }
else
{ } }
elsif ($tag eq "dd")
{ if ($startflag)
{ print TEXI "\n"; }
else
{ }
if (scalar(@index_deferrers) != 0)
{ $he->dump;
die "index deferrers: ", join(" ", @index_deferrers); }
do_deferred_index_entries();
}
elsif ($tag =~ /^(font|big|small)$/)
{ # Do nothing for now.
}
elsif ($tag =~ /^h[1-6]$/)
{ # We don't need this because we never recursively enter the heading content.
# push_or_pop_index_deferrers($tag, $startflag);
my $secname = "";
my @seclabels = ();
for my $elt (@{$he->content})
{ if (!ref $elt)
{ $secname .= $elt; }
elsif ($elt->tag eq "br")
{ }
elsif ($elt->tag eq "a")
{ my ($name, $href, @acontent) = anchor_info($elt);
if ($href)
{ $he->dump;
$elt->dump;
die "Nonsimple anchor in <$tag>"; }
if (!defined $name)
{ die "No NAME for anchor in $tag"; }
push @seclabels, $name;
for my $subelt (@acontent)
{ $secname .= html_to_texi($subelt); } }
else
{ $secname .= html_to_texi($elt); } }
if ($secname eq "")
{ die "No section name in <$tag>"; }
if (scalar(@section_stack) == 1)
{ if ($section_stack[-1] ne "Top")
{ die "Not top? $section_stack[-1]"; }
print TEXI "\@settitle $secname\n";
print TEXI "\@c %**end of header\n";
print TEXI "\n";
print TEXI "\@node Top\n";
print TEXI "\n"; }
else
{ print TEXI "\n\@node $section_stack[-1]\n";
print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n"; }
for my $seclabel (@seclabels)
{ label_add_index_entries($seclabel); }
# This should only happen once per file.
label_add_index_entries("");
if (scalar(@index_deferrers) != 0)
{ die "index deferrers: ", join(" ", @index_deferrers); }
do_deferred_index_entries();
return 0;
}
elsif ($tag eq "hr")
{ }
elsif ($tag eq "ignore")
{ # Hack for ignored elements
return 0;
}
elsif ($tag eq "li")
{ if ($startflag)
{ print TEXI "\n\n\@item\n";
do_deferred_index_entries(); } }
elsif ($tag eq "ol")
{ if ($startflag)
{ print TEXI "\n\@enumerate \@bullet\n"; }
else
{ print TEXI "\n\@end enumerate\n"; } }
elsif ($tag eq "p")
{ if ($startflag)
{ print TEXI "\n\n"; }
if (scalar(@index_deferrers) != 0)
{ die "index deferrers: ", join(" ", @index_deferrers); }
do_deferred_index_entries(); }
elsif ($tag eq "pre")
{ print_pre($he);
return 0; }
elsif ($tag eq "table")
{ # Could also indicate common formatting for first column, or
# determine relative widths for columns (or determine a prototype row)
if ($startflag)
{ if (defined $table_columns)
{ $he->dump;
die "Can't deal with table nested inside $table_columns-column table"; }
$table_columns = table_columns($he);
if ($table_columns < 2)
{ $he->dump;
die "Column with $table_columns columns?"; }
elsif ($table_columns == 2)
{ print TEXI "\n\@table \@asis\n"; }
else
{ print TEXI "\n\@multitable \@columnfractions";
for (my $i=0; $i<$table_columns; $i++)
{ print TEXI " ", 1.0/$table_columns; }
print TEXI "\n"; } }
else
{ if ($table_columns == 2)
{ print TEXI "\n\@end table\n"; }
else
{ print TEXI "\n\@end multitable\n"; }
undef $table_columns; } }
elsif (($tag eq "td") || ($tag eq "th"))
{ if ($startflag)
{ if ($table_first_column)
{ print TEXI "\n\@item ";
$table_first_column = 0; }
elsif ($table_columns > 2)
{ print TEXI "\n\@tab "; } }
else
{ print TEXI "\n"; } }
elsif ($tag eq "tr")
{ if ($startflag)
{ $table_first_column = 1; } }
elsif ($tag eq "ul")
{ if ($startflag)
{ print TEXI "\n\@itemize \@bullet\n"; }
else
{ print TEXI "\n\@end itemize\n"; } }
else
{ print STDERR "\nBailing out\n";
$he->dump;
return 0; }
return 1;
}
sub print_pre ( $ )
{ my ($he_pre) = check_args(1, @_);
if (!has_single_content_string($he_pre))
{ die "Multiple or non-string content for <PRE>: ", @{$he_pre->content}; }
my $pre_content = $ {$he_pre->content}[0];
print TEXI "\n\@example";
print TEXI &texi_quote($pre_content);
print TEXI "\@end example\n";
}
sub table_columns ( $ )
{ my ($table) = check_args(1, @_);
my $result = 0;
for my $row (@{$table->content})
{ if ($row->tag ne "tr")
{ $table->dump;
$row->dump;
die "Expected <TR> as table row."; }
$result = max($result, scalar(@{$row->content})); }
return $result;
}
###########################################################################
### Utilities
###
sub min ( $$ )
{ my ($x, $y) = check_args(2, @_);
return ($x < $y) ? $x : $y;
}
sub max ( $$ )
{ my ($x, $y) = check_args(2, @_);
return ($x > $y) ? $x : $y;
}
sub file_to_tree ( $ )
{ my ($file) = check_args(1, @_);
my $tree = new HTML::TreeBuilder;
$tree->ignore_unknown(1);
# $tree->warn(1);
$tree->parse_file($file);
cleanup_parse_tree($tree);
return $tree
}
sub has_single_content ( $ )
{ my ($he) = check_args(1, @_);
if (!ref $he)
{ # return 0;
die "Non-reference argument: $he"; }
my $ref_content = $he->content;
if (!defined $ref_content)
{ return 0; }
my @content = @{$ref_content};
if (scalar(@content) != 1)
{ return 0; }
return 1;
}
# Return true if the content of the element contains only one element itself,
# and that inner element has the specified tag.
sub has_single_content_with_tag ( $$ )
{ my ($he, $tag) = check_args(2, @_);
if (!has_single_content($he))
{ return 0; }
my $content = $ {$he->content}[0];
if (!ref $content)
{ return 0; }
my $content_tag = $content->tag;
if (!defined $content_tag)
{ return 0; }
return $content_tag eq $tag;
}
sub has_single_content_string ( $ )
{ my ($he) = check_args(1, @_);
if (!has_single_content($he))
{ return 0; }
my $content = $ {$he->content}[0];
if (ref $content)
{ return 0; }
return 1;
}
# Return name, href, content. First two may be undefined; third is an array.
# I don't see how to determine if there are more attributes.
sub anchor_info ( $ )
{ my ($he) = check_args(1, @_);
if ($he->tag ne "a")
{ $he->dump;
die "passed non-anchor to anchor_info"; }
my $name = $he->attr('name');
my $href = $he->attr('href');
my @content = ();
{ my $ref_content = $he->content;
if (defined $ref_content)
{ @content = @{$ref_content}; } }
return ($name, $href, @content);
}
sub texi_quote ( $ )
{ my ($text) = check_args(1, @_);
$text =~ s/([\@\{\}])/\@$1/g;
$text =~ s/ -- / --- /g;
return $text;
}
# Eliminate bad punctuation (that confuses Makeinfo or Info) for section titles.
sub texi_remove_punctuation ( $ )
{ my ($text) = check_args(1, @_);
$text =~ s/^ +//g;
$text =~ s/[ :]+$//g;
$text =~ s/^[1-9][0-9.]* +//g;
$text =~ s/,//g;
# Both embedded colons and " -- " confuse makeinfo. (Perhaps " -- "
# gets converted into " - ", just as "---" would be converted into " -- ",
# so the names end up differing.)
# $text =~ s/:/ -- /g;
$text =~ s/://g;
return $text;
}
## Do not use this inside `traverse': it throws off the traversal. Use
## html_replace_by_ignore or html_replace_by_meta instead.
# Returns 1 if success, 0 if failure.
sub html_remove ( $;$ )
{ my ($he, $parent) = check_args_range(1, 2, @_);
if (!defined $parent)
{ $parent = $he->parent; }
my $ref_pcontent = $parent->content;
my @pcontent = @{$ref_pcontent};
for (my $i=0; $i<scalar(@pcontent); $i++)
{ if ($pcontent[$i] eq $he)
{ splice @{$ref_pcontent}, $i, 1;
$he->parent(undef);
return 1; } }
die "Didn't find $he in $parent";
}
sub html_replace ( $$;$ )
{ my ($orig, $new, $parent) = check_args_range(2, 3, @_);
if (!defined $parent)
{ $parent = $orig->parent; }
my $ref_pcontent = $parent->content;
my @pcontent = @{$ref_pcontent};
for (my $i=0; $i<scalar(@pcontent); $i++)
{ if ($pcontent[$i] eq $orig)
{ $ {$ref_pcontent}[$i] = $new;
$new->parent($parent);
$orig->parent(undef);
return 1; } }
die "Didn't find $orig in $parent";
}
sub html_replace_by_meta ( $;$ )
{ my ($orig, $parent) = check_args_range(1, 2, @_);
my $meta = new HTML::Element "meta";
if (!defined $parent)
{ $parent = $orig->parent; }
return html_replace($orig, $meta, $parent);
}
sub html_replace_by_ignore ( $;$ )
{ my ($orig, $parent) = check_args_range(1, 2, @_);
my $ignore = new HTML::Element "ignore";
if (!defined $parent)
{ $parent = $orig->parent; }
return html_replace($orig, $ignore, $parent);
}
###
### Collect text elements
###
my @collected_texts;
my $collect_texts_stoppoint;
my $done_collecting;
sub collect_texts ( $;$ )
{ my ($root, $stop) = check_args_range(1, 2, @_);
# print STDERR "collect_texts: $root $stop\n";
$collect_texts_stoppoint = $stop;
$done_collecting = 0;
@collected_texts = ();
$root->traverse(\&collect_if_text); # process texts
# print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
return @collected_texts;
}
sub collect_if_text ( $$$ )
{ my $he = (check_args(3, @_))[0]; # ignore depth and startflag arguments
if ($done_collecting)
{ return 0; }
if (!defined $he)
{ return 0; }
if (!ref $he)
{ push @collected_texts, $he;
return 0; }
if ((defined $collect_texts_stoppoint) && ($he eq $collect_texts_stoppoint))
{ $done_collecting = 1;
return 0; }
return 1;
}
###########################################################################
### Clean up parse tree
###
sub cleanup_parse_tree ( $ )
{ my ($he) = check_args(1, @_);
$he->traverse(\&delete_if_navigation, 'ignore text');
$he->traverse(\&delete_extra_spaces, 'ignore text');
$he->traverse(\&merge_dl, 'ignore text');
return $he;
}
## Simpler version that deletes contents but not the element itself.
# sub delete_if_navigation ( $$$ )
# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
# if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
# { $he->delete();
# return 0; }
# else
# { return 1; }
# }
sub delete_if_navigation ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
if (!$startflag)
{ return; }
if (($he->tag() eq "div") && (defined $he->attr('class')) && ($he->attr('class') eq 'navigation'))
{ my $ref_pcontent = $he->parent()->content();
# Don't try to modify @pcontent, which appears to be a COPY.
# my @pcontent = @{$ref_pcontent};
for (my $i = 0; $i<scalar(@{$ref_pcontent}); $i++)
{ if (${$ref_pcontent}[$i] eq $he)
{ splice(@{$ref_pcontent}, $i, 1);
last; } }
$he->delete();
return 0; }
else
{ return 1; }
}
sub delete_extra_spaces ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
if (!$startflag)
{ return; }
my $tag = $he->tag;
if ($tag =~ /^(head|html|table|tr|ul)$/)
{ delete_child_spaces($he); }
delete_trailing_spaces($he);
return 1;
}
sub delete_child_spaces ( $ )
{ my ($he) = check_args(1, @_);
my $ref_content = $he->content();
for (my $i = 0; $i<scalar(@{$ref_content}); $i++)
{ if ($ {$ref_content}[$i] =~ /^ *$/)
{ splice(@{$ref_content}, $i, 1);
$i--; } }
}
sub delete_trailing_spaces ( $ )
{ my ($he) = check_args(1, @_);
my $ref_content = $he->content();
if (! defined $ref_content)
{ return; }
# Could also check for previous element = /^h[1-6]$/.
for (my $i = 0; $i<scalar(@{$ref_content})-1; $i++)
{ if ($ {$ref_content}[$i] =~ /^ *$/)
{ my $next_elt = $ {$ref_content}[$i+1];
if ((ref $next_elt) && ($next_elt->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/))
{ splice(@{$ref_content}, $i, 1);
$i--; } } }
if ($he->tag =~ /^(dd|dt|^h[1-6]|li|p)$/)
{ my $last_elt = $ {$ref_content}[$#{$ref_content}];
if ((defined $last_elt) && ($last_elt =~ /^ *$/))
{ pop @{$ref_content}; } }
}
# If we find a paragraph that looks like
# <P>
# <HR>
# <UL>
# then accumulate its links into a contents_list and delete the paragraph.
sub process_if_child_links ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
if (!$startflag)
{ return; }
if ($he->tag() eq "p")
{ my $ref_content = $he->content();
if (defined $ref_content)
{ my @content = @{$ref_content};
if ((scalar(@content) == 2)
&& (ref $content[0]) && $content[0]->tag() eq "hr"
&& (ref $content[1]) && $content[1]->tag() eq "ul")
{ process_child_links($he);
$he->delete();
return 0; } } }
return 1;
}
# If we find
# <H4>
# "Footnotes"
# <DL>
# <DT>
# <A NAME="foot560">
# "...borrow"
# <A HREF="refcountsInPython.html#tex2html2" NAME="foot560">
# "1.2"
# <DD>
# "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. "
# ...
# then record the footnote information and delete the section and list.
my $process_if_footnotes_expect_dl_next = 0;
sub process_if_footnotes ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
if (!$startflag)
{ return; }
if (($he->tag() eq "h4")
&& has_single_content_string($he)
&& ($ {$he->content}[0] eq "Footnotes"))
{ html_replace_by_ignore($he);
$process_if_footnotes_expect_dl_next = 1;
return 0; }
if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl"))
{ my $ref_content = $he->content();
if (defined $ref_content)
{ $process_if_footnotes_expect_dl_next = 0;
my @content = @{$ref_content};
for (my $i=0; $i<$#content; $i+=2)
{ my $he_dt = $content[$i];
my $he_dd = $content[$i+1];
if (($he_dt->tag ne "dt") || ($he_dd->tag ne "dd"))
{ $he->dump;
die "expected <DT> and <DD> at positions $i and ", $i+1; }
my @dt_content = @{$he_dt->content()};
if ((scalar(@dt_content) != 2)
|| ($dt_content[0]->tag ne "a")
|| ($dt_content[1]->tag ne "a"))
{ $he_dt->dump;
die "Expected 2 anchors as content of <DT>"; }
my ($dt1_name, $dt1_href, $dt1_content) = anchor_info($dt_content[0]);
my ($dt2_name, $dt2_href, $dt2_content) = anchor_info($dt_content[0]);
# unused: $dt1_href, $dt1_content, $dt2_href, $dt2_content
if ($dt1_name ne $dt2_name)
{ $he_dt->dump;
die "Expected identical names for anchors"; }
html_replace_by_ignore($he_dd);
$he_dd->tag("div"); # has no effect
$footnotes{$dt1_name} = $he_dd; }
html_replace_by_ignore($he);
return 0; } }
if ($process_if_footnotes_expect_dl_next)
{ $he->dump;
die "Expected <DL> for footnotes next"; }
return 1;
}
## Merge two adjacent paragraphs containing <DL> items, such as:
# <P>
# <DL>
# <DT>
# ...
# <DD>
# ...
# <P>
# <DL>
# <DT>
# ...
# <DD>
# ...
sub merge_dl ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
if (!$startflag)
{ return; }
my $ref_content = $he->content;
if (!defined $ref_content)
{ return; }
my $i = 0;
while ($i < scalar(@{$ref_content})-1)
{ my $p1 = $ {$ref_content}[$i];
if ((ref $p1) && ($p1->tag eq "p")
&& has_single_content_with_tag($p1, "dl"))
{ my $dl1 = $ {$p1->content}[0];
# In this loop, rhs, not lhs, of < comparison changes,
# because we are removing elements from the content of $he.
while ($i < scalar(@{$ref_content})-1)
{ my $p2 = $ {$ref_content}[$i+1];
if (!((ref $p2) && ($p2->tag eq "p")
&& has_single_content_with_tag($p2, "dl")))
{ last; }
# Merge these two elements.
splice(@{$ref_content}, $i+1, 1); # remove $p2
my $dl2 = $ {$p2->content}[0];
$dl1->push_content(@{$dl2->content}); # put $dl2's content in $dl1
}
# extra increment because next element isn't a candidate for $p1
$i++; }
$i++; }
return 1;
}
###########################################################################
### Testing
###
sub test ( $$ )
{ my ($action, $file) = check_args(2, @_);
# General testing
if (($action eq "view") || ($action eq ""))
{ # # $file = "/homes/gws/mernst/www/links.html";
# # $file = "/homes/gws/mernst/www/index.html";
# # $file = "/homes/fish/mernst/java/gud/doc/manual.html";
# # $file = "/projects/cecil/cecil/doc/manuals/stdlib-man/stdlib/stdlib.html";
# # $file = "/homes/fish/mernst/tmp/python-doc/html/index.html";
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
my $tree = file_to_tree($file);
## Testing
# print STDERR $tree->as_HTML;
$tree->dump();
# print STDERR $tree->tag(), "\n";
# print STDERR @{$tree->content()}, "\n";
#
# for (@{ $tree->extract_links(qw(a img)) }) {
# my ($link, $linkelem) = @$_;
# print STDERR "$link ", $linkelem->as_HTML;
# }
#
# print STDERR @{$tree->extract_links()}, "\n";
# my @top_level_elts = @{$tree->content()};
# if scalar(@{$tree->content()})
return;
}
elsif ($action eq "raw")
{ my $tree = new HTML::TreeBuilder;
$tree->ignore_unknown(1);
# $tree->warn(1);
$tree->parse_file($file);
$tree->dump();
# cleanup_parse_tree($tree);
# $tree->dump();
return;
}
# Test dealing with a section.
elsif ($action eq "section")
{ # my $file;
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/intro.html";
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/includes.html";
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
process_section_file($file, 0, "Title");
}
# Test dealing with many sections
elsif (0)
{ my @files = ("/homes/fish/mernst/tmp/python-doc/html/api/about.html",
"/homes/fish/mernst/tmp/python-doc/html/api/abstract.html",
"/homes/fish/mernst/tmp/python-doc/html/api/api.html",
"/homes/fish/mernst/tmp/python-doc/html/api/cObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/concrete.html",
# "/homes/fish/mernst/tmp/python-doc/html/api/contents.html",
"/homes/fish/mernst/tmp/python-doc/html/api/countingRefs.html",
"/homes/fish/mernst/tmp/python-doc/html/api/debugging.html",
"/homes/fish/mernst/tmp/python-doc/html/api/dictObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/embedding.html",
"/homes/fish/mernst/tmp/python-doc/html/api/exceptionHandling.html",
"/homes/fish/mernst/tmp/python-doc/html/api/exceptions.html",
"/homes/fish/mernst/tmp/python-doc/html/api/fileObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/floatObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/front.html",
"/homes/fish/mernst/tmp/python-doc/html/api/fundamental.html",
# "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html",
"/homes/fish/mernst/tmp/python-doc/html/api/importing.html",
"/homes/fish/mernst/tmp/python-doc/html/api/includes.html",
"/homes/fish/mernst/tmp/python-doc/html/api/index.html",
"/homes/fish/mernst/tmp/python-doc/html/api/initialization.html",
"/homes/fish/mernst/tmp/python-doc/html/api/intObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/intro.html",
"/homes/fish/mernst/tmp/python-doc/html/api/listObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/longObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/mapObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/mapping.html",
"/homes/fish/mernst/tmp/python-doc/html/api/newTypes.html",
"/homes/fish/mernst/tmp/python-doc/html/api/node24.html",
"/homes/fish/mernst/tmp/python-doc/html/api/noneObject.html",
"/homes/fish/mernst/tmp/python-doc/html/api/number.html",
"/homes/fish/mernst/tmp/python-doc/html/api/numericObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/object.html",
"/homes/fish/mernst/tmp/python-doc/html/api/objects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/os.html",
"/homes/fish/mernst/tmp/python-doc/html/api/otherObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/processControl.html",
"/homes/fish/mernst/tmp/python-doc/html/api/refcountDetails.html",
"/homes/fish/mernst/tmp/python-doc/html/api/refcounts.html",
"/homes/fish/mernst/tmp/python-doc/html/api/sequence.html",
"/homes/fish/mernst/tmp/python-doc/html/api/sequenceObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/standardExceptions.html",
"/homes/fish/mernst/tmp/python-doc/html/api/stringObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/threads.html",
"/homes/fish/mernst/tmp/python-doc/html/api/tupleObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/typeObjects.html",
"/homes/fish/mernst/tmp/python-doc/html/api/types.html",
"/homes/fish/mernst/tmp/python-doc/html/api/utilities.html",
"/homes/fish/mernst/tmp/python-doc/html/api/veryhigh.html");
for my $file (@files)
{ print STDERR "\n", "=" x 75, "\n", "$file:\n";
process_section_file($file, 0, "Title");
}
}
# Test dealing with index.
elsif ($action eq "index")
{ # my $file;
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html";
process_index_file($file, "\@cindex");
print_index_info();
}
else
{ die "Unrecognized action `$action'"; }
}
###########################################################################
### Main loop
###
sub process_contents_file ( $ )
{ my ($file) = check_args(1, @_);
# could also use File::Basename
my $info_file = $file;
$info_file =~ s/(\/?index)?\.html$//;
if ($info_file eq "")
{ chomp($info_file = `pwd`); }
$info_file =~ s/^.*\///; # not the most efficient way to remove dirs
$html_directory = $file;
$html_directory =~ s/(\/|^)[^\/]+$/$1/;
my $texi_file = "$info_file.texi";
open(TEXI, ">$texi_file");
print TEXI "\\input texinfo \@c -*-texinfo-*-\n";
print TEXI "\@c %**start of header\n";
print TEXI "\@setfilename $info_file\n";
# 2. Summary Description and Copyright
# The "Summary Description and Copyright" segment describes the
# document and contains the copyright notice and copying permissions
# for the Info file. The segment must be enclosed between `@ifinfo'
# and `@end ifinfo' commands so that the formatters place it only in
# the Info file.
#
# The summary description and copyright segment does not appear in the
# printed document.
#
# @ifinfo
# This is a short example of a complete Texinfo file.
#
# Copyright @copyright{} 1990 Free Software Foundation, Inc.
# @end ifinfo
# 3. Title and Copyright
# The "Title and Copyright" segment contains the title and copyright
# pages and copying permissions for the printed manual. The segment
# must be enclosed between `@titlepage' and `@end titlepage'
# commands. The title and copyright page appear only in the printed
# manual.
#
# The titlepage segment does not appear in the Info file.
#
# @titlepage
# @sp 10
# @comment The title is printed in a large font.
# @center @titlefont{Sample Title}
#
# @c The following two commands start the copyright page.
# @page
# @vskip 0pt plus 1filll
# Copyright @copyright{} 1990 Free Software Foundation, Inc.
# @end titlepage
# 4. `Top' Node and Master Menu
# The "Master Menu" contains a complete menu of all the nodes in the
# whole Info file. It appears only in the Info file, in the `Top'
# node.
#
# The `Top' node contains the master menu for the Info file. Since a
# printed manual uses a table of contents rather than a menu, the master
# menu appears only in the Info file.
#
# @node Top, First Chapter, , (dir)
# @comment node-name, next, previous, up
#
# @menu
# * First Chapter:: The first chapter is the
# only chapter in this sample.
# * Concept Index:: This index has two entries.
# @end menu
$current_ref_tdf = [ "Top", 0, $ARGV[0] ];
process_section_file($file, 0, "Top");
while (scalar(@contents_list))
{ $current_ref_tdf = shift @contents_list;
process_section_file($ {$current_ref_tdf}[2], $ {$current_ref_tdf}[1], $ {$current_ref_tdf}[0]);
}
print TEXI "\n";
for my $indextitle (@index_titles)
{ print TEXI "\@node $indextitle\n";
print TEXI "\@unnumbered $indextitle\n";
print TEXI "\@printindex $ {$index_info{$indextitle}}[1]\n";
print TEXI "\n"; }
print TEXI "\@contents\n";
print TEXI "\@bye\n";
close(TEXI);
}
# This needs to be last so global variable initializations are reached.
if (scalar(@ARGV) == 0)
{ die "No arguments supplied to html2texi.pl"; }
if ($ARGV[0] eq "-test")
{ my @test_args = @ARGV[1..$#ARGV];
if (scalar(@test_args) == 0)
{ test("", "index.html"); }
elsif (scalar(@test_args) == 1)
{ test("", $test_args[0]); }
elsif (scalar(@test_args) == 2)
{ test($test_args[0], $test_args[1]); }
else
{ die "Too many test arguments passed to html2texi: ", join(" ", @ARGV); }
exit();
}
if (scalar(@ARGV) != 1)
{ die "Pass one argument, the main/contents page"; }
process_contents_file($ARGV[0]);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment