Context Navigation

Back to Ticket #54437

Ticket #54437: makeman

File makeman, 15.4 KB (added by BrentSqAR, 7 years ago)

Line
1	#!/usr/bin/python
2	#
3	# makeman -- compile netpbm's stereotyped HTML to troff markup
4	#
5	# This approach works because we control the entire document universe
6	# this is going to convert and can reinforce useful stereotypes.
7	#
8	# The output of this tool uses cliches parseable by doclifter,
9	# which should thus be able to recover all the semantic information
10	# it looks like this thing is losing.
11	#
12	# Known bugs:
13	# * Ordered lists are smashed into unordered lists
14	#
15	# Limitations:
16	# * IMG tags are issued as .IMG preceded by a bolded caption containing
17	# the alt content. This will only work if the page is formatted with
18	# mwww macros.
19	# * Loses summary information from tables.
20	# * Only permits one <HR> in the HTML, right before the index.
21	#
22	# You can use the <?makeman ?> PI to pass text directly through to the
23	# generated manual page. A major use is to insert format lines for tables.
24	#
25	# By Eric S. Raymond <esr@thyrsus.com>
26	# Version 1.0, July 26 2004
27	#
28	# Modified by Akira F. Urushibata <afu@wta.att.ne.jp>
29	# Version 1.1, February 11 2016
30	#
31	# Added ability to process &mdash; &minus;
32	# Added footer message to clarify original source.
33	#
34
35	import os, sys, exceptions, re
36
# Text placed in the last field of the .TH line that makeman() emits.
37	source = "netpbm documentation"
# Default manual section; makeman() declares this global and reassigns it
# for every page it converts.
38	section = 1
39
# Comment block that makeman() prepends to every generated page.
# This is a raw string, so the backslash before the first newline is part
# of the text rather than a Python line continuation.
40	warning = r'''\
41	.\" This man page was generated by the Netpbm tool 'makeman' from HTML source.
42	.\" Do not hand-hack it! If you have bug fixes or improvements, please find
43	.\" the corresponding HTML page on the Netpbm website, generate a patch
44	.\" against that, and send it to the Netpbm maintainer.
45	'''
46
# Footer section that makeman() appends to every generated page; the base
# name of the HTML source file is concatenated directly after the URL.
47	footerprefix = '''.SH DOCUMENT SOURCE
48	This manual page was generated by the Netpbm tool 'makeman' from HTML
49	source. The master documentation is at
50	.IP
51	.B http://netpbm.sourceforge.net/doc/'''
52
class LiftException(Exception):
    """Fatal conversion error carrying a message and a process exit status.

    message -- human-readable description of the problem.
    retval  -- value main() returns (and the script exits with) when the
               exception is caught there; defaults to 1.
    """

    def __init__(self, message, retval=1):
        # Initialise the base class so that str(e) and tracebacks show the
        # message instead of an empty string.
        Exception.__init__(self, message)
        self.message = message
        self.retval = retval
57
def makeman(name, file, indoc):
    """Transform a string representing an HTML document into man markup.

    The conversion is a fixed sequence of textual substitutions; their
    order matters because later steps rely on what earlier steps removed
    or rewrote.

    Parameters:
      name  -- ignored on entry; it is overwritten with the content of the
               document's <H1> element (None when there is none).
      file  -- name of the HTML source; used in the bad-lines report and,
               reduced to its base name, in the generated footer.
      indoc -- the HTML document text.

    Returns the troff source as a string, prefixed with the module-level
    `warning` text and followed by `footerprefix` plus the source base name.

    Side effects: assigns the global `section`, reads the global `sectmap`
    (which must have been filled in beforehand), and writes a report to
    sys.stderr for every output line that still looks like HTML.
    """
    global section, sectmap
    # A dot at the left margin would be read by troff as a request.  Hide
    # such input lines behind a cookie so they stay distinguishable from
    # the requests generated below; the cookie is resolved at the very end.
    indoc = indoc.replace("\n.", "\n@%@%@")
    # Protect escapes before we try generating font changes.
    indoc = indoc.replace("\\", r"\e")

    # Header-bashing
    indoc = re.sub('(?i)<!DOCTYPE html[^>]*>', "", indoc)
    indoc = indoc.replace('<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">', "")
    indoc = indoc.replace('<meta http-equiv="Content-Type" content="text/html; charset=us-ascii"/>', "")
    indoc = indoc.replace('<?xml version="1.1" encoding="iso-8859-1" ?>\n', "")
    indoc = indoc.replace('<html xmlns="http://www.w3.org/1999/xhtml">', "")
    indoc = indoc.replace('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">', "")
    indoc = indoc.replace("<HEAD>", "").replace("</HEAD>", "")
    indoc = indoc.replace("<head>", "").replace("</head>", "")
    indoc = re.sub('(?i)<A HREF="#index">Table Of Contents</A>', "", indoc)

    # Pull out the "Updated:" date for the .TH line.
    datematch = re.compile("Updated: (.*)\n")
    match = datematch.search(indoc)
    if match:
        date = match.group(1)
    else:
        date = ""
    indoc = datematch.sub("", indoc)

    # Page name comes from the <H1> element.
    namematch = re.compile("<H1>(.*)</H1>", re.I)
    match = namematch.search(indoc)
    if match:
        name = match.group(1)
    else:
        name = None

    # Manual section: an explicit META tag wins, otherwise look the page
    # name up in the cross-reference map (0 when it is unknown).
    section = 1
    meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
    match = meta.search(indoc)
    if match:
        section = int(match.group(1))
        indoc = meta.sub("", indoc)
    else:
        section = sectmap.get(name, 0)
    indoc = namematch.sub("", indoc)
    indoc = re.sub("(?i)<BODY[^>]*>", "", indoc)
    indoc = re.sub("(?i)<HTML>", "", indoc)

    # Remove more superfluous headers
    titlematch = re.compile("<TITLE>(.*)</TITLE>\n+", re.I)
    match = titlematch.search(indoc)
    if match:
        title = match.group(1)
    else:
        title = None
    indoc = titlematch.sub("", indoc)
    indoc = re.sub("(?i)\n*<BR>\n+", "\n", indoc)
    indoc = re.sub("(?i)<BR>", "\n", indoc)
    indoc = ('.TH "%s" %d "%s" "%s"\n' % (title, section, date, source)) + indoc

    # Literal layout
    indoc = re.sub("(?i)\n *<PRE>", "\n.nf", indoc)
    indoc = re.sub("(?i)\n *</PRE>", "\n.fi", indoc)
    indoc = re.sub("(?i)\n *<BLOCKQUOTE>", "\n.RS", indoc)
    indoc = re.sub("(?i)\n *</BLOCKQUOTE>", "\n.RE", indoc)

    # Highlight processing: each inline tag opens a font and its closing
    # tag returns to the previous font.
    for tag, font in (("B", r"\\fB"),
                      ("EM", r"\\fI"),
                      ("CITE", r"\\fI"),
                      ("I", r"\\fI"),
                      ("TT", r"\\f(CW"),
                      ("KBD", r"\\f(CW"),
                      ("CODE", r"\\f(CW"),
                      ("STRONG", r"\\fB")):
        indoc = re.sub("(?i)<%s>" % tag, font, indoc)
        indoc = re.sub("(?i)</%s>" % tag, r"\\fP", indoc)
    indoc = re.sub("(?i)<SUP>", r"\\u", indoc)
    indoc = re.sub("(?i)</SUP>", r"\\d", indoc)

    # Paragraph handling
    indoc = re.sub("(?i)\n<P>\n", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)<br */>", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)</P>", "", indoc)
    indoc = re.sub("(?i)<!--[^>]*-->", "", indoc)
    indoc = re.sub("(?i)<meta[^>]*>", "", indoc)

    # Inside any list, paragraph breaks become plain vertical space (.sp)
    # instead of .PP.
    lines = indoc.split("\n")
    listdepth = 0
    for i, line in enumerate(lines):
        lowered = line.lower()
        if "<dl" in lowered or "<ol" in lowered or "<ul" in lowered:
            listdepth += 1
        if listdepth:
            lines[i] = line.replace(".PP", ".sp")
        if "</dl>" in lowered or "</ol>" in lowered or "</ul>" in lowered:
            listdepth -= 1
    indoc = "\n".join(lines)
    indoc = re.sub(r"\s*\.sp", "\n.sp", indoc)

    # Format email addresses as italic
    indoc = re.sub('(?i)<A[ \n]+HREF="mailto:[^>]+">([^<]+)</A>', r'\\fI\1\\fP', indoc)

    # Format manual crossreferences
    def xrefmatch(match):
        xrefto = match.group(2)
        xrefurl = match.group(1)
        xrefsection = sectmap.get(xrefurl, 1)
        if xrefsection == 0:
            return "\n.I " + xrefto
        else:
            return '\n.BR "%s" (%d)\\c\n\\&' % (xrefto, xrefsection)
    indoc = re.sub(r'(?i)\n* *(?:\\fB)?<A[ \n]+HREF="?([^>]+.html)"?>([^<]+)</A>(?:\\fP)?',
                   xrefmatch, indoc)

    # Format URLs
    def urlmatch(match):
        url = match.group(1).replace('\n', ' ')
        txt = match.group(2).replace('\n', ' ')
        return "\n.UR %s\n%s\n.UE\n\\&" % (url, txt)
    indoc = re.sub(r'(?i)\n*(?:&lt;)?<A[ \n]+HREF *= *"([^>]+)">([^<]+)</A>(?:&gt;)?',
                   urlmatch, indoc)

    # Turn some entities into harmless cookies.  They must not look like
    # markup to the bad-line check below; they are expanded after it.
    indoc = indoc.replace("&lt;", "@#!#@").replace("&gt;", "#@!@#").replace("&amp;", "#!@!@!#")
    indoc = indoc.replace("&times;", r"\(mu")
    indoc = indoc.replace("&reg;", r"\*R")
    indoc = indoc.replace("&copy;", r"\(co")

    # Turn anchors into .UN tags
    indoc = re.sub(r'(?i)<A NAME *= *"#?([a-zA-Z_][a-zA-Z_0-9.-]+)">(?:&nbsp;)*</A>\s*',
                   ".UN \\1\n", indoc)

    # Strip off the index trailer: everything from the first <HR> onward.
    trailer = re.compile('<HR */*>.*', re.DOTALL | re.IGNORECASE)
    indoc = re.sub(trailer, "", indoc)
    # If there was no index trailer, we still need to strip these
    indoc = indoc.replace("</BODY>", "").replace("</HTML>", "")
    indoc = indoc.replace("</body>", "").replace("</html>", "")

    headings = (("H2", ".SH"), ("H3", ".SS"), ("H4", ".B"))
    # Recognize sections with IDs, first with the ID on a nested anchor...
    for tag, macro in headings:
        indoc = re.sub('(?i)<%s><A (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</A></%s>' % (tag, tag),
                       ".UN \\1\n" + macro + " \\2", indoc)
    # ...then with the ID on the heading element itself.
    for tag, macro in headings:
        indoc = re.sub('(?i)<%s (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</%s>' % (tag, tag),
                       ".UN \\1\n" + macro + " \\2", indoc)
    # Sections without IDs
    for tag, macro in headings:
        indoc = re.sub('(?i)<%s>([^><]*)</%s>' % (tag, tag), macro + " \\1", indoc)

    # Process definition lists -- just turn them into .TPs
    indoc = re.sub("(?i) *<DL *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *</DL>", "", indoc)
    indoc = re.sub("(?i) *<DT>", ".TP\n", indoc)
    indoc = re.sub("(?i) *</DT>", "", indoc)
    indoc = re.sub("(?i)\n<DD>\n", "\n", indoc)
    indoc = re.sub("(?i) *</DD>", "", indoc)

    # Process unordered lists -- turn every item into a bulleted .IP
    # (ordered lists get the same treatment and lose their numbering).
    indoc = re.sub("(?i)</?[UO]L *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *<LI>", ".IP \\(bu\n", indoc)
    indoc = re.sub("(?i) *</LI>", "", indoc)

    # No-print tags: drop the rest of the line after the marker.
    indoc = re.sub("<!--no_print-->.*", "", indoc)
    # Passthrough: <?makeman text ?> inserts text verbatim.
    indoc = re.sub(r"<\?makeman (.*) \?>", r'\1', indoc)
    # Comments
    indoc = re.sub("<!--([^\n])*-->", r'.\"\1', indoc)
    # Acronyms
    indoc = re.sub('<acronym [a-zA-Z0-9:= \n"]*>', "", indoc)
    indoc = re.sub("</acronym>", "", indoc)
    # Image tags: bold caption from the alt text followed by .IMG
    indoc = re.sub(' *<img src="([^"]*)" alt="([^"]*)"( *[a-z]*="?[0-9]*"?)*>',
                   ".B \\2\n.IMG -C \\1", indoc)

    # Special characters
    indoc = indoc.replace("&quot;", "'")
    indoc = indoc.replace("&nbsp;", "\\ ")
    indoc = indoc.replace("&minus;", "-")
    indoc = indoc.replace("&mdash;", "-")
    indoc = indoc.replace("&mu;", "mu")
    indoc = indoc.replace("&sigma;", "sigma")

    # Tables
    # This will not handle rowspan
    indoc = re.sub('(?i) *<table[^>]*>.*', ".TS", indoc)
    indoc = re.sub("(?i) *</table>.*", ".TE", indoc)
    # First the single-line case
    indoc = re.sub("(?i)</td> *<td>", "\t", indoc)
    indoc = re.sub("(?i)<tr> *<td>", "", indoc)
    indoc = re.sub("(?i)</td> *</tr>", "", indoc)
    # Then the multiline case: one-line cells become tab-separated fields,
    # cells containing newlines become T{ ... T} text blocks.
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<\n]*)</t[dh]>\s*', '\t\\1', indoc)
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<]*)</t[dh]>\s*', '\tT{\n\\1T}', indoc)
    indoc = indoc.replace("\n\\&T}", "\nT}")
    indoc = re.sub("(?i) *</tr>", "", indoc)
    indoc = re.sub("(?i) *<tr[^>]*>\t*", "", indoc)
    indoc = re.sub(r"\.TS\s+<[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>([^<]*)</[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>\s*",
                   ".B \\1\n.TS\n", indoc)

    # Time for error checking now: anything that still contains an angle
    # bracket or an unconverted entity was not understood.
    badlines = [line for line in indoc.split("\n")
                if "<" in line
                or ">" in line.replace(" >", "")
                or re.search(r'(?<!^\\)&.*;', line)]
    if badlines:
        sys.stderr.write(("Bad lines from %s:\n-----------------\n" % file)
                         + "\n".join(badlines)
                         + "\n-----------------\n")

    # Goes after bad-line check so we don't misinterpret it as an error
    indoc = indoc.replace("@#!#@", "<").replace("#@!@#", ">").replace("#!@!@!#", "&")
    indoc = re.sub("\n+$", "\n", indoc)
    # Single-quote at left margin confuses troff.
    # This program never generates these.
    indoc = indoc.replace("\n'", "\n\\&'")
    # Finish guarding against leading dots.
    indoc = indoc.replace("\n@%@%@", "\n\\&.")
    # Mark these generated pages so people won't hand-hack them.
    indoc = warning + indoc
    indoc = indoc + footerprefix + os.path.basename(file) + "\n.PP"
    return indoc
269
def _read_html(dirprefix, file, mainerr):
    """Return the text of `file` under `dirprefix`, or None if it can't be opened.

    A failure to open is reported on `mainerr`; errors while reading are
    left to propagate to the caller.
    """
    try:
        infp = open(os.path.join(dirprefix, file))
    except (IOError, OSError):
        mainerr.write("makeman: can't open %s\n" % file)
        return None
    with infp:
        return infp.read()


def main(args, mainout=sys.stdout, mainerr=sys.stderr):
    """Convert the HTML files named in `args` to troff manual pages.

    Options accepted in `args`:
      -d DIR  read the HTML input files from DIR
      -v      verbose: name each file as it is converted and let internal
              errors propagate with a traceback

    Pass 1 reads every file to fill the global `sectmap`, which maps each
    page's <TITLE>, <H1> name and file name to its manual section (from a
    manual_section META tag, default 1).  Pass 2 converts each file with
    makeman() and writes the result to "<stem>.<section>" in the current
    directory, where <stem> is the file name up to its first dot.

    Diagnostics go to `mainerr`; `mainout` is accepted but not used.

    Returns None on success, otherwise an exit status: 1 if a temporary
    file can't be created, 2 for bad options, the LiftException's retval
    for a malformed page, 3 for file I/O errors, 4 on keyboard interrupt
    and 5 for any other error (unless -v was given).
    """
    global sectmap
    import getopt

    try:
        (options, arguments) = getopt.getopt(args, "vd:")
    except getopt.GetoptError as e:
        mainerr.write("makeman: %s\n" % e)
        return 2
    dirprefix = ""
    verbosity = 0
    for (switch, val) in options:
        if switch == '-d':      # Set HTML input directory
            dirprefix = val
        elif switch == '-v':    # Enable verbose error reporting
            verbosity += 1

    namere = re.compile("<H1>(.*)</H1>", re.I)
    titlere = re.compile("<TITLE>(.*)</TITLE>", re.I)
    meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
    hr = re.compile("(?i)<HR>")

    try:
        # First pass: gather locations for crossreferences:
        sectmap = {}
        names = {}
        for file in arguments:
            indoc = _read_html(dirprefix, file, mainerr)
            if indoc is None:
                continue
            namematch = namere.search(indoc)
            titlematch = titlere.search(indoc)
            if not namematch:
                raise LiftException("name missing from %s" % file)
            if not titlematch:
                raise LiftException("title missing from %s" % file)
            title = titlematch.group(1)
            # The page name is the <H1> content (it used to be copied from
            # the title, which left sectmap without any name keys).
            name = namematch.group(1)
            names[file] = name
            match = meta.search(indoc)
            section = int(match.group(1)) if match else 1
            sectmap[title] = sectmap[file] = sectmap[name] = section
            # makeman() discards everything after the first <HR>, so a
            # second one means content would be lost.
            firsthr = hr.search(indoc)
            if firsthr and hr.search(indoc, firsthr.end()):
                raise LiftException("%s has two <HR> tags!" % file)

        # Second pass: do formatting
        for file in arguments:
            indoc = _read_html(dirprefix, file, mainerr)
            if indoc is None:
                continue
            # Work out the destination before creating any temporary file.
            dot = file.find(".")
            stem = file if dot < 0 else file[:dot]
            target = stem + "." + str(sectmap[file])
            tempfile = file + ".~makeman-%d~" % os.getpid()
            try:
                outfp = open(tempfile, "w")
            except (IOError, OSError):
                mainerr.write("makeman: can't open tempfile %s\n" % tempfile)
                return 1
            converted = False
            try:
                with outfp:
                    if verbosity:
                        mainerr.write("makeman: %s\n" % file)
                    outdoc = makeman(names.get(file), file, indoc)
                    # Nothing to write if the conversion produced no
                    # document or left the input unchanged.
                    if outdoc is not None and outdoc != indoc:
                        outfp.write(outdoc)
                        converted = True
            finally:
                # Also reached when makeman() raises: never leave a stray
                # temporary file behind.
                if not converted:
                    os.remove(tempfile)
            if not converted:
                continue
            # The file is closed by now; under Windows you can't rename an
            # open file.
            os.rename(tempfile, target)
    except LiftException as e:
        mainerr.write("makeman: " + e.message + "\n")
        return e.retval
    except (IOError, OSError) as e:
        mainerr.write("makeman: file I/O error: %s\n" % e)
        return 3
    except KeyboardInterrupt:
        mainerr.write("makeman: bailing out...\n")
        return 4
    except Exception:
        if verbosity:
            raise
        mainerr.write("makeman: internal error!\n")
        return 5
363
if __name__ == "__main__":
    # Run the main sequence; main()'s return value becomes the exit status
    # (None means success).
    raise SystemExit(main(sys.argv[1:]))
367
368	# The following sets edit modes for GNU EMACS
369	# Local Variables:
370	# mode:python
371	# End:

Download in other formats:

Original Format