HTML para LaTeX: como posso usar Python e lxml para converter um documento HTML para LaTeX com macros personalizadas?

Question

Examinei muitas ferramentas e optei pelo lxml com uma função recursiva para mapear tags HTML para marcação LaTeX. Oferece um único local para definir facilmente seu mapeamento com Python. Acredito que trabalhei a partir de um exemplo do livro, Fundamentos da programação de rede Python.

Aqui está um exemplo mínimo funcional em Python 2.7:

# convert html document to LaTeX
import lxml.html # http://lxml.de/lxmlhtml.html
from lxml import etree
from io import StringIO, BytesIO

def html2latex(el): # fill in this function to catch and convert html tags
    r"""Recursively convert the content of an element to LaTeX markup.

    The element's own text is emitted first; then, for every child, the
    converted child followed by the child's tail text. The tag of ``el``
    itself is not inspected here: each child's wrapper is chosen by the
    loop in its parent's call.

    Tag mapping (extend the ``if``/``elif`` chain to add more):
      * ``h1``                              -> ``\hmchapter{...}``
      * ``td``, ``table``                   -> passed through as literal tags
      * ``span style="font-style:italic"``  -> ``\textit{...}``
      * anything else                       -> converted content, no wrapper

    :param el: element exposing ``text``, ``tail``, ``tag``, ``attrib`` and
        iteration over its children (the lxml / ElementTree element API).
    :returns: the concatenated LaTeX string for the content of ``el``.
    """
    result = []
    if el.text:
        result.append(el.text)
    for sel in el:
        # While extending the mapping, printing sel.tag, sel.text, sel.tail
        # and sel.attrib here shows what each child carries.
        content = html2latex(sel)
        if sel.tag == "h1":
            # Raw string: '\h' is not a valid escape sequence.
            result.append(r'\hmchapter{%s}' % content)
        elif sel.tag in ("td", "table"):
            result.append("<%s>%s</%s>" % (sel.tag, content, sel.tag))
        elif sel.tag == "span" and sel.attrib.get('style') == 'font-style:italic':
            # The style value must match exactly.
            result.append(r'\textit{%s}' % content)
        else:
            # Also reached by spans without the italic style, so that their
            # text is kept instead of being silently dropped.
            result.append(content)
        if sel.tail:
            result.append(sel.tail)
    return "".join(result)

def main():
    """Parse a small sample HTML document and print its LaTeX conversion."""
    # must be unicode or lxml parse crashes
    html = u'''
<!DOCTYPE html>
    <html>
  <head>
    <title></title>
  </head>
<body >
  <h1 class="hmchapter" data-hmvarbodychaptertitle = "My title">My title</h1>
  text <span style="font-style:italic">in a specific context</span> and more.
</body>
</html>
'''
    parser = etree.HTMLParser()
    # etree.parse expects a file, so the string is wrapped in StringIO
    tree = etree.parse(StringIO(html), parser)
    root = tree.getroot()
    latex = html2latex(root)
    # Call form of print: valid on Python 2.7 and on Python 3, where the
    # statement form `print latex` is a syntax error.
    print(latex)


if __name__ == '__main__':
    main()

que imprime:

\hmchapter{My title}
text \textit{in a specific context} and more.

Answer 1

Examinei muitas ferramentas e optei pelo lxml com uma função recursiva para mapear tags HTML para marcação LaTeX. Oferece um único local para definir facilmente seu mapeamento com Python. Acredito que trabalhei a partir de um exemplo do livro, Fundamentos da programação de rede Python.

Aqui está um exemplo mínimo funcional em Python 2.7:

# convert html document to LaTeX
import lxml.html # http://lxml.de/lxmlhtml.html
from lxml import etree
from io import StringIO, BytesIO

def html2latex(el): # fill in this function to catch and convert html tags
    r"""Recursively convert the content of an element to LaTeX markup.

    The element's own text is emitted first; then, for every child, the
    converted child followed by the child's tail text. The tag of ``el``
    itself is not inspected here: each child's wrapper is chosen by the
    loop in its parent's call.

    Tag mapping (extend the ``if``/``elif`` chain to add more):
      * ``h1``                              -> ``\hmchapter{...}``
      * ``td``, ``table``                   -> passed through as literal tags
      * ``span style="font-style:italic"``  -> ``\textit{...}``
      * anything else                       -> converted content, no wrapper

    :param el: element exposing ``text``, ``tail``, ``tag``, ``attrib`` and
        iteration over its children (the lxml / ElementTree element API).
    :returns: the concatenated LaTeX string for the content of ``el``.
    """
    result = []
    if el.text:
        result.append(el.text)
    for sel in el:
        # While extending the mapping, printing sel.tag, sel.text, sel.tail
        # and sel.attrib here shows what each child carries.
        content = html2latex(sel)
        if sel.tag == "h1":
            # Raw string: '\h' is not a valid escape sequence.
            result.append(r'\hmchapter{%s}' % content)
        elif sel.tag in ("td", "table"):
            result.append("<%s>%s</%s>" % (sel.tag, content, sel.tag))
        elif sel.tag == "span" and sel.attrib.get('style') == 'font-style:italic':
            # The style value must match exactly.
            result.append(r'\textit{%s}' % content)
        else:
            # Also reached by spans without the italic style, so that their
            # text is kept instead of being silently dropped.
            result.append(content)
        if sel.tail:
            result.append(sel.tail)
    return "".join(result)

def main():
    """Parse a small sample HTML document and print its LaTeX conversion."""
    # must be unicode or lxml parse crashes
    html = u'''
<!DOCTYPE html>
    <html>
  <head>
    <title></title>
  </head>
<body >
  <h1 class="hmchapter" data-hmvarbodychaptertitle = "My title">My title</h1>
  text <span style="font-style:italic">in a specific context</span> and more.
</body>
</html>
'''
    parser = etree.HTMLParser()
    # etree.parse expects a file, so the string is wrapped in StringIO
    tree = etree.parse(StringIO(html), parser)
    root = tree.getroot()
    latex = html2latex(root)
    # Call form of print: valid on Python 2.7 and on Python 3, where the
    # statement form `print latex` is a syntax error.
    print(latex)


if __name__ == '__main__':
    main()

que imprime:

\hmchapter{My title}
text \textit{in a specific context} and more.

HTML para LaTeX: como posso usar Python e lxml para converter um documento HTML para LaTeX com macros personalizadas?

Responder1

informação relacionada