Ticket #54437: makeman

File makeman, 15.4 KB (added by BrentSqAR, 7 years ago)
Line 
1#!/usr/bin/python
2#
3# makeman -- compile netpbm's stereotyped HTML to troff markup
4#
5# This approach works because we control the entire document universe
6# this is going to convert and can reinforce useful stereotypes.
7#
8# The output of this tool uses cliches parseable by doclifter,
9# which should thus be able to recover all the semantic information
10# it looks like this thing is losing.
11#
12# Known bugs:
13#  * Ordered lists are smashed into unordered lists
14#
15# Limitations:
16#  * IMG tags are issued as .IMG preceded by a bolded caption containing
17#    the alt content.  This will only work if the page is formatted with
18#    mwww macros.
19#  * Loses summary information from tables.
20#  * Only permits one <HR> in the HTML, right before the index.
21#
22# You can use the <?makeman ?> PI to pass text directly through to the
23# generated manual page.  A major use is to insert format lines for tables.
24#
25# By Eric S. Raymond <esr@thyrsus.com>
26# Version 1.0, July 26 2004
27#
28# Modified by Akira F. Urushibata <afu@wta.att.ne.jp>
29# Version 1.1, February 11 2016
30#
31#   Added ability to process &mdash; &minus;
32#   Added footer message to clarify original source.
33#
34
35import os, sys, exceptions, re
36
# Text placed in the last field of the .TH header line of every page.
source = "netpbm documentation"
# Default manual section; makeman() resets this global for each page.
section = 1

# Comment block put at the very top of every generated page.
# The text starts right after the opening quotes on purpose: inside a raw
# string a backslash-newline is NOT a line continuation, so opening the
# literal with r'''\ would emit a stray "\" line at the top of every page.
warning = r'''.\" This man page was generated by the Netpbm tool 'makeman' from HTML source.
.\" Do not hand-hack it!  If you have bug fixes or improvements, please find
.\" the corresponding HTML page on the Netpbm website, generate a patch
.\" against that, and send it to the Netpbm maintainer.
'''

# Start of the footer appended to every page; makeman() completes the URL
# by appending the basename of the HTML source file.
footerprefix = '''.SH DOCUMENT SOURCE
This manual page was generated by the Netpbm tool 'makeman' from HTML
source.  The master documentation is at
.IP
.B http://netpbm.sourceforge.net/doc/'''
52
class LiftException(Exception):
    """Fatal conversion error carrying a message and an exit status.

    message -- human-readable description of the problem.
    retval  -- status code associated with the error (default 1).

    Derives from the built-in Exception (the separate `exceptions` module
    exists only in Python 2).
    """
    def __init__(self, message, retval=1):
        # Initialise the base class so str(e) and tracebacks show the message.
        Exception.__init__(self, message)
        self.message = message
        self.retval = retval
57
def makeman(name, file, indoc):
    """Transform a string representing an HTML document into man markup.

    name  -- ignored on entry: it is overwritten with the contents of the
             document's <H1> element (None when there is none).
    file  -- name of the HTML source file; used in the bad-line report
             and, via its basename, in the DOCUMENT SOURCE footer.
    indoc -- the HTML document text.

    Returns the troff source as a string.  Reads the module globals
    sectmap, source, warning and footerprefix (sectmap must already have
    been set up) and sets the global section.  Lines that still look like
    unconverted markup are reported on stderr; conversion continues.

    The substitutions are order-dependent; do not reorder them.
    """
    global section, sectmap
    # Dot at left margin confuses troff.  Hide such dots behind a cookie
    # for now; they are restored, guarded by \&, at the very end.
    indoc = indoc.replace("\n.", "\n@%@%@")
    # Protect escapes before we try generating font changes.
    indoc = indoc.replace("\\", r"\e")
    # Header-bashing
    indoc = re.sub('(?i)<!DOCTYPE html[^>]*>', "", indoc)
    indoc = indoc.replace('<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">', "")
    indoc = indoc.replace('<meta http-equiv="Content-Type" content="text/html; charset=us-ascii"/>', "")
    indoc = indoc.replace('<?xml version="1.1" encoding="iso-8859-1" ?>\n',"")
    indoc = indoc.replace('<html xmlns="http://www.w3.org/1999/xhtml">', "")
    indoc = indoc.replace('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">', "")
    indoc = indoc.replace("<HEAD>", "").replace("</HEAD>", "")
    indoc = indoc.replace("<head>", "").replace("</head>", "")
    indoc = re.sub('(?i)<A HREF="#index">Table Of Contents</A>', "", indoc)
    # Pull the "Updated:" date out of the text for the .TH line.
    datematch = re.compile("Updated: (.*)\n")
    match = datematch.search(indoc)
    if match:
        date = match.group(1)
    else:
        date = ""
    indoc = datematch.sub("", indoc)
    # The page name is the contents of the <H1> element.
    namematch = re.compile("<H1>(.*)</H1>", re.I)
    match = namematch.search(indoc)
    if match:
        name = match.group(1)
    else:
        name = None
    # Manual section: an explicit META tag wins; otherwise look the page
    # name up in the cross-reference map, giving 0 when it is not there.
    section = 1
    meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
    match = meta.search(indoc)
    if match:
        section = int(match.group(1))
        indoc = meta.sub("", indoc)
    else:
        section = sectmap.get(name, 0)
    indoc = namematch.sub("", indoc)
    indoc = re.sub("(?i)<BODY[^>]*>", "", indoc)
    indoc = re.sub("(?i)<HTML>", "", indoc)
    # Remove more superfluous headers
    titlematch = re.compile("<TITLE>(.*)</TITLE>\n+", re.I)
    match = titlematch.search(indoc)
    if match:
        title = match.group(1)
    else:
        title = None
    indoc = titlematch.sub("", indoc)
    indoc = re.sub("(?i)\n*<BR>\n+", "\n", indoc)
    indoc = re.sub("(?i)<BR>", "\n", indoc)
    indoc = ('.TH "%s" %d "%s" "%s"\n' % (title,section,date,source)) + indoc
    # Literal layout
    indoc = re.sub("(?i)\n *<PRE>", "\n.nf", indoc)
    indoc = re.sub("(?i)\n *</PRE>", "\n.fi", indoc)
    indoc = re.sub("(?i)\n *<BLOCKQUOTE>", "\n.RS", indoc)
    indoc = re.sub("(?i)\n *</BLOCKQUOTE>", "\n.RE", indoc)
    # Highlight processing
    indoc = re.sub("(?i)<B>", r"\\fB", indoc)
    indoc = re.sub("(?i)</B>", r"\\fP", indoc)
    indoc = re.sub("(?i)<EM>", r"\\fI", indoc)
    indoc = re.sub("(?i)</EM>", r"\\fP", indoc)
    indoc = re.sub("(?i)<CITE>", r"\\fI", indoc)
    indoc = re.sub("(?i)</CITE>", r"\\fP", indoc)
    indoc = re.sub("(?i)<I>", r"\\fI", indoc)
    indoc = re.sub("(?i)</I>", r"\\fP", indoc)
    indoc = re.sub("(?i)<TT>", r"\\f(CW", indoc)
    indoc = re.sub("(?i)</TT>", r"\\fP", indoc)
    indoc = re.sub("(?i)<KBD>", r"\\f(CW", indoc)
    indoc = re.sub("(?i)</KBD>", r"\\fP", indoc)
    indoc = re.sub("(?i)<CODE>", r"\\f(CW", indoc)
    indoc = re.sub("(?i)</CODE>", r"\\fP", indoc)
    indoc = re.sub("(?i)<STRONG>", r"\\fB", indoc)
    indoc = re.sub("(?i)</STRONG>", r"\\fP", indoc)
    indoc = re.sub("(?i)<SUP>", r"\\u", indoc)
    indoc = re.sub("(?i)</SUP>", r"\\d", indoc)
    # Paragraph handling
    indoc = re.sub("(?i)\n*<P>\n*", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)<br */>", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)</P>", "", indoc)
    indoc = re.sub("(?i)<!--[^>]*-->", "", indoc)
    indoc = re.sub("(?i)<meta[^>]*>", "", indoc)
    # Inside lists, paragraph breaks become plain vertical space (.sp).
    lines = indoc.split("\n")
    listdepth = 0
    for i, line in enumerate(lines):
        lowered = line.lower()
        if "<dl" in lowered or "<ol" in lowered or "<ul" in lowered:
            listdepth += 1
        if listdepth:
            lines[i] = line.replace(".PP", ".sp")
        if "</dl>" in lowered or "</ol>" in lowered or "</ul>" in lowered:
            listdepth -= 1
    indoc = "\n".join(lines)
    indoc = re.sub(r"\s*\.sp", "\n.sp", indoc)
    # Format email addresses as italic
    indoc = re.sub('(?i)<A[ \n]+HREF="mailto:[^>]+">([^<]+)</A>', r'\\fI\1\\fP', indoc)
    # Format manual crossreferences
    def xrefmatch(match):
        xrefto = match.group(2)
        xrefurl = match.group(1)
        xrefsection = sectmap.get(xrefurl, 1)
        if xrefsection == 0:
            return "\n.I " + xrefto
        else:
            return '\n.BR "%s" (%d)\\c\n\\&' % (xrefto, xrefsection)
    indoc = re.sub(r'(?i)\n* *(?:\\fB)?<A[ \n]+HREF="?([^>]+.html)"?>([^<]+)</A>(?:\\fP)?',
                   xrefmatch, indoc)
    # Format URLs
    def urlmatch(match):
        url = match.group(1).replace('\n', ' ')
        txt = match.group(2).replace('\n', ' ')
        return "\n.UR %s\n%s\n.UE\n\\&" % (url, txt)
    indoc = re.sub(r'(?i)\n*(?:&lt;)?<A[ \n]+HREF *= *"([^>]+)">([^<]+)</A>(?:&gt;)?',
                  urlmatch, indoc)
    # Turn some entities into harmless cookies
    indoc = indoc.replace("&lt;", "@#!#@").replace("&gt;", "#@!@#").replace("&amp;", "#!@!@!#")
    indoc = indoc.replace("&#215;", r"\(mu")
    indoc = indoc.replace("&#174;", r"\*R")
    indoc = indoc.replace("&copy;", r"\(co")
    # Turn anchors into .UN tags
    indoc = re.sub(r'(?i)<A NAME *= *"#?([a-zA-Z_][a-zA-Z_0-9.-]+)">(?:&nbsp;)*</A>\s*', ".UN \\1\n", indoc)
    # Strip off the index trailer
    trailer = re.compile('<HR */*>.*', re.DOTALL | re.IGNORECASE)
    indoc = re.sub(trailer, "", indoc)
    # If there was no index trailer, we still need to strip these
    indoc = indoc.replace("</BODY>", "").replace("</HTML>", "")
    indoc = indoc.replace("</body>", "").replace("</html>", "")
    # Recognize sections with IDs
    indoc = re.sub('(?i)<H2><A (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</A></H2>',
                   ".UN \\1\n.SH \\2", indoc)
    indoc = re.sub('(?i)<H3><A (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</A></H3>',
                   ".UN \\1\n.SS \\2", indoc)
    indoc = re.sub('(?i)<H4><A (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</A></H4>',
                   ".UN \\1\n.B \\2", indoc)
    indoc = re.sub('(?i)<H2 (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</H2>',
                   ".UN \\1\n.SH \\2", indoc)
    indoc = re.sub('(?i)<H3 (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</H3>',
                   ".UN \\1\n.SS \\2", indoc)
    indoc = re.sub('(?i)<H4 (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</H4>',
                   ".UN \\1\n.B \\2", indoc)
    # Sections without IDs
    indoc = re.sub('(?i)<H2>([^><]*)</H2>', ".SH \\1", indoc)
    indoc = re.sub('(?i)<H3>([^><]*)</H3>', ".SS \\1", indoc)
    indoc = re.sub('(?i)<H4>([^><]*)</H4>', ".B \\1", indoc)
    #
    # Process definition lists -- just turn them into .TPs
    indoc = re.sub("(?i) *<DL *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *</DL>", "", indoc)
    indoc = re.sub("(?i) *<DT>", ".TP\n", indoc)
    indoc = re.sub("(?i) *</DT>", "", indoc)
    indoc = re.sub("(?i)\n*<DD>\n*", "\n", indoc)
    indoc = re.sub("(?i) *</DD>", "", indoc)
    # Process ordered and unordered lists -- every item becomes a bullet .IP
    indoc = re.sub("(?i)</?[UO]L *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *<LI>", ".IP \\(bu\n", indoc)
    indoc = re.sub("(?i) *</LI>", "", indoc)
    # No-print tags
    # NOTE(review): "<!--no_print-->" has no ">" inside the comment, so the
    # comment-stripping substitution in the paragraph-handling step above
    # has already removed it; this looks unreachable -- confirm.
    indoc = re.sub("<!--no_print-->.*", "", indoc)
    # Passthrough
    indoc = re.sub(r"<\?makeman (.*) \?>", r'\1', indoc)
    # Comments: carry the whole comment text over into a troff comment.
    # The repetition belongs inside the group; "([^\n])*" would capture
    # only the last character of the comment.
    indoc = re.sub("<!--([^\n]*)-->", r'.\"\1', indoc)
    # Acronyms
    indoc = re.sub('<acronym [a-zA-Z0-9:= \n"]*>', "", indoc)
    indoc = re.sub("</acronym>", "", indoc)
    # Image tags
    indoc = re.sub(' *<img src="([^"]*)" alt="([^"]*)"( *[a-z]*="?[0-9]*"?)*>', ".B \\2\n.IMG -C \\1", indoc)
    # Special characters
    indoc = indoc.replace("&quot;", "'")
    indoc = indoc.replace("&nbsp;", "\\ ")
    indoc = indoc.replace("&minus;", "-")
    indoc = indoc.replace("&mdash;", "-")
    indoc = indoc.replace("&mu;", "mu")
    indoc = indoc.replace("&sigma;", "sigma")
    # Tables
    # This will not handle rowspan
    indoc = re.sub('(?i) *<table[^>]*>.*', ".TS", indoc)
    indoc = re.sub("(?i) *</table>.*", ".TE", indoc)
    # First the single-line case
    indoc = re.sub("(?i)</td> *<td>", "\t", indoc)
    indoc = re.sub("(?i)<tr> *<td>", "", indoc)
    indoc = re.sub("(?i)</td> *</tr>", "", indoc)
    # Then the multiline case
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<\n]*)</t[dh]>\s*', '\t\\1', indoc)
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<]*)</t[dh]>\s*', '\tT{\n\\1T}', indoc)
    indoc = indoc.replace("\n\\&T}", "\nT}")
    indoc = re.sub("(?i) *</tr>", "", indoc)
    indoc = re.sub("(?i) *<tr[^>]*>\t*", "", indoc)
    indoc = re.sub(r"\.TS\s+<[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>([^<]*)</[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>\s*", ".B \\1\n.TS\n", indoc)
    # Debugging
    #sys.stderr.write("Name: %s, Title: %s, Date: %s\n" % (name, title, date))
    # Time for error checking now: anything that still contains angle
    # brackets or an entity is reported as unconverted markup.
    badlines = []
    for line in indoc.split("\n"):
        if "<" in line or ">" in line.replace(" >", "") or re.search(r'(?<!^\\)&.*;', line):
            badlines.append(line)
    if badlines:
        sys.stderr.write(("Bad lines from %s:\n-----------------\n" % file) + "\n".join(badlines) + "\n-----------------\n")
    # Goes after bad-line check so we don't misinterpret it as an error
    indoc = indoc.replace("@#!#@", "<").replace("#@!@#", ">").replace("#!@!@!#", "&")
    indoc = re.sub("\n+$", "\n", indoc)
    # Single-quote at left margin confuses troff.
    # This program never generates these.
    indoc = indoc.replace("\n'", "\n\\&'")
    # Finish guarding against leading dots.
    indoc = indoc.replace("\n@%@%@", "\n\\&.")
    # Mark these generated pages so people won't hand-hack them.
    indoc = warning + indoc
    indoc = indoc + footerprefix + os.path.basename(file) +"\n.PP"
    return indoc
269
def main(args, mainout=sys.stdout, mainerr=sys.stderr):
    """Convert the HTML files named on the command line to man pages.

    args    -- argument list without the program name: options -d DIR
               (directory the HTML files are read from) and -v (verbose
               error reporting; may be repeated), then HTML file names.
    mainout -- unused; kept for interface compatibility.
    mainerr -- stream that receives all diagnostics written here.

    Pass one reads every file and fills the global sectmap, mapping each
    page's title, file name and <H1> name to its manual section (1 unless
    a manual_section META tag says otherwise).  Pass two runs makeman()
    on each file and writes the result to "<stem>.<section>", where stem
    is the file name up to its first dot.

    Returns None on success, the retval of a LiftException (missing
    <H1>/<TITLE>, or more than one <HR>), 1 if a temporary file cannot be
    created, 3 on a file I/O error, 4 on keyboard interrupt and 5 on any
    other error (which is re-raised instead when -v is given).
    """
    global sectmap
    import getopt
    (options, arguments) = getopt.getopt(args, "vd:")
    dirprefix = ""
    verbosity = 0
    for (switch, val) in options:
        if switch == '-d':      # Set HTML input directory
            dirprefix = val
        elif switch == '-v':    # Enable verbose error reporting
            verbosity += 1
    try:
        # First pass: gather locations for crossreferences:
        sectmap = {}
        namere = re.compile("<H1>(.*)</H1>", re.I)
        titlere = re.compile("<TITLE>(.*)</TITLE>", re.I)
        meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
        hr = re.compile("(?i)<HR>")
        for file in arguments:
            try:
                infp = open(os.path.join(dirprefix, file))
            except (IOError, OSError):
                mainerr.write("makeman: can't open %s\n" % file)
                continue
            with infp:
                indoc = infp.read()
            namematch = namere.search(indoc)
            titlematch = titlere.search(indoc)
            if not namematch:
                raise LiftException("name missing from %s" % file)
            if not titlematch:
                raise LiftException("title missing from %s" % file)
            title = titlematch.group(1)
            # The name is the <H1> text; makeman() looks the section up
            # under it when a page has no manual_section META tag.
            name = namematch.group(1)
            match = meta.search(indoc)
            if match:
                section = int(match.group(1))
            else:
                section = 1
            sectmap[title] = sectmap[file] = sectmap[name] = section
            # makeman() drops everything from the first <HR> on, so a
            # second one means page content would be lost.
            firsthr = hr.search(indoc)
            if firsthr and hr.search(indoc[firsthr.start(0)+4:]):
                raise LiftException("%s has two <HR> tags!" % file)
        # Second pass: do formatting
        for file in arguments:
            try:
                infp = open(os.path.join(dirprefix, file))
            except (IOError, OSError):
                mainerr.write("makeman: can't open %s\n" % file)
                continue
            with infp:
                indoc = infp.read()
            # Temporary name built only from the file name and our PID,
            # not from page text that may hold spaces or slashes.
            tempfile = "%s.~makeman-%d~" % (file, os.getpid())
            try:
                outfp = open(tempfile, "w")
            except (IOError, OSError):
                mainerr.write("makeman: can't open tempfile %s\n" % tempfile)
                return 1
            keep = False
            try:
                # Leaving the with-block closes the file before it is
                # removed or renamed (under Windows you can't do either
                # to an open file).
                with outfp:
                    if verbosity:
                        mainerr.write("makeman: %s\n" % file)
                    # makeman() takes the page name from the document
                    # itself and ignores its first argument.
                    outdoc = makeman(None, file, indoc)
                    changed = outdoc is not None and outdoc != indoc
                    if changed:
                        outfp.write(outdoc)
                # Only reached when conversion, write and close succeeded.
                keep = changed
            finally:
                if not keep:
                    os.remove(tempfile)
            if not keep:
                continue
            # Stem is the name up to the first dot; a name without any
            # dot is used whole (find() returns -1 in that case).
            dot = file.find(".")
            if dot < 0:
                stem = file
            else:
                stem = file[:dot]
            os.rename(tempfile, "%s.%d" % (stem, sectmap[file]))
    except LiftException as e:
        mainerr.write("makeman: " + e.message + "\n")
        return e.retval
    except (IOError, OSError) as e:
        mainerr.write("makeman: file I/O error: %s\n" % e)
        return 3
    except KeyboardInterrupt:
        mainerr.write("makeman: bailing out...\n")
        return 4
    except Exception:
        if verbosity:
            # Let the traceback through when the user asked for detail.
            raise
        mainerr.write("makeman: internal error!\n")
        return 5
363
if __name__ == "__main__":
    # Run the main sequence; main()'s return value becomes the exit status.
    # The call form of raise is valid in both Python 2 and Python 3.
    raise SystemExit(main(sys.argv[1:]))
367
368# The following sets edit modes for GNU EMACS
369# Local Variables:
370# mode:python
371# End: