Utilizador:Giro720/Gutenberg2wiki.py

Este script foi escrito originalmente para o Python 2.x e requer que a biblioteca BeautifulSoup esteja instalada.

Tipicamente, os livros do Projeto Gutenberg indicam as páginas através da marcação <span class="pagenum">, embora o nome da classe possa variar conforme o livro.

#!/usr/bin/python
# -*- coding: utf-8  -*-

from bs4 import BeautifulSoup as bs
import codecs
import urllib2
import os.path

try:
    import pypandoc
except:
    pass


def read_Gutenberg(url):
    """Return the HTML of a Project Gutenberg book, caching it locally.

    The last path component of ``url`` is used as the name of a cache file in
    the current working directory. If that file already exists it is read and
    returned; otherwise the book is downloaded, saved under that name and the
    downloaded content is returned.

    Args:
        url: Address of the HTML edition of the book.

    Returns:
        The raw (undecoded) bytes of the HTML document.
    """
    filename = url.split('/')[-1]
    if os.path.isfile(filename):
        print("Lendo arquivo local...")
        # Binary mode mirrors the "wb" used when the cache file is written,
        # so a cached read returns exactly what a fresh download returns.
        with open(filename, "rb") as cached:
            return cached.read()

    print("Baixando livro da internet...")
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    with open(filename, "wb") as cached:
        cached.write(html)
    # The original fell through to ``return html`` without ever binding it in
    # this branch; return the freshly downloaded content.
    return html


def clean_text(text):
    """Convert an HTML fragment into MediaWiki-style wikitext.

    A straight apostrophe following d, D, n, N or h is replaced by the
    typographic apostrophe (U+2019). The fragment is then converted with
    pypandoc when that is usable; otherwise a handful of tags (bold, italic,
    paragraph) are translated by plain string replacement. Finally, line
    breaks, dashes and runs of whitespace are normalised.

    Args:
        text: The HTML fragment. Any object is accepted; it is converted to a
            text string first.

    Returns:
        The cleaned-up text.
    """
    # ``unicode`` only exists on Python 2; fall back to ``str`` on Python 3.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    text = text_type(text).strip()

    for letter in u"dDnNh":
        text = text.replace(letter + u"'", letter + u"’")

    try:
        # Recent pypandoc releases provide convert_text(); older ones only
        # have convert(). Both accept (source, to, format=...).
        convert = getattr(pypandoc, "convert_text", None) or pypandoc.convert
        text = convert(text, 'mediawiki', format='html')
        text = text.replace(u"\r", u"")
    except Exception:
        # Deliberate best-effort fallback: pypandoc may be missing (the
        # module-level import is optional) or the conversion may fail.
        text = text.replace(u"\r", u"").replace(u"\n", u" ").strip()
        tag_to_wiki = (
            (u"<b>", u"'''"), (u"</b>", u"'''"),
            (u"<em>", u"''"), (u"</em>", u"''"),
            # The original looked for "<it>" and a second "</em>", so
            # <i>...</i> was never converted.
            (u"<i>", u"''"), (u"</i>", u"''"),
            (u"<p>", u"\n"), (u"</p>", u"\n"),
        )
        for html_tag, wiki_markup in tag_to_wiki:
            text = text.replace(html_tag, wiki_markup)

    text = text.replace(u"<br />", u"\n").replace(u"<br/>", u"\n")
    # Surround the horizontal bar used for dialogue with spaces.
    text = text.replace(u"―", u" ― ")
    text = text.replace(u"   ", u" ").replace(u"  ", u" ")
    # Drop spaces that touch a line break, then squeeze triple newlines.
    text = text.replace(u"\n ", u"\n")
    text = text.replace(u" \n", u"\n")
    text = text.replace(u"\n\n\n", u"\n\n")
    return text


# Book to convert; the last path component also names the local cache file
# and, with "_output.txt" appended, the output file.
url = 'http://www.gutenberg.org/files/29120/29120-h/29120-h.htm'
html = read_Gutenberg(url)
# NOTE(review): no parser is named here, so BeautifulSoup chooses one itself;
# confirm whether a specific parser should be pinned.
soup = bs(html)

# Page markers; the class name may differ from book to book.
page_markers = soup.find_all('span', {"class": "pagenum"})

with codecs.open(url.split('/')[-1] + "_output.txt", "w", "utf-8") as outputfile:
    # Pair every marker with the one that follows it. The last marker has no
    # successor, so the text after it is not exported.
    for tag, next_tag in zip(page_markers, page_markers[1:]):
        print(tag)
        pieces = []
        # Collect the siblings that follow this marker up to the next marker.
        # Iterating next_siblings simply stops when the siblings run out,
        # whereas the original loop raised AttributeError on None whenever
        # the next marker was not a sibling of the current one.
        for sibling in tag.next_siblings:
            if sibling is next_tag:
                break
            try:
                pieces.append(unicode(sibling))
            except Exception:
                # Best effort: skip a node that cannot be rendered as text.
                pass
        outputfile.write(clean_text(u"".join(pieces)))
        outputfile.write(u"\n\n\n\n")