# Scrape Latin texts from http://penelope.uchicago.edu/Thayer/E/Roman/home.html

import bs4
import re
import requests

def load(url, timeout=30):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole scrape.

    Returns:
        A ``bs4.BeautifulSoup`` tree built with the ``lxml`` parser.

    Raises:
        requests.RequestException: If the request fails or times out.
            The HTTP status is not checked; an error page is parsed like
            any other response.
    """
    response = requests.get(url, timeout=timeout)
    # ``content`` is the undecoded body; BeautifulSoup works out the
    # character encoding itself when handed bytes.
    return bs4.BeautifulSoup(response.content, 'lxml')

def extract(soup):
    """Yield the text fragments of a page's main body, one string at a time.

    The body is every sibling that follows the header table (a ``table``
    with class ``headerbox`` or ``header``), up to but not including the
    first element whose class list contains ``endnotes``. Within each
    ``<p>`` of that body, elements carrying one of the apparatus classes
    listed in ``bad`` are removed before the remaining strings are yielded.

    Note that *soup* is modified: the body siblings are moved out of the
    tree into a scratch ``div``.

    Args:
        soup: Parsed page (``bs4.BeautifulSoup``).

    Yields:
        Each string found in the body paragraphs, stripped of surrounding
        whitespace. Whitespace-only strings come out as empty strings.

    Raises:
        ValueError: If the page has no header table, so the start of the
            body cannot be located.
    """
    header = soup.find('table', class_=('headerbox', 'header'))
    if header is None:
        # find() returns None on no match; fail with a message that says
        # what is wrong instead of an AttributeError on None.
        raise ValueError(
            'no header table (class "headerbox" or "header") found in page')

    root = soup.new_tag('div')

    # Snapshot the siblings first: extract() detaches nodes from the tree,
    # which would otherwise disturb the sibling iteration.
    for tag in list(header.next_siblings):
        # Text nodes have no ``attrs``; only real tags can mark the endnotes.
        if hasattr(tag, 'attrs') and 'endnotes' in tag.attrs.get('class', ()):
            break
        root.append(tag.extract())

    # Classes of elements that are dropped from the paragraphs before the
    # text is read.
    bad = ('pagenum', 'linenum', 'translation_flag', 'verse_speaker',
           'ref', 'sec', 'chapter')
    for p in root.find_all('p'):
        for x in p.find_all(class_=bad):
            x.decompose()
        for s in p.strings:
            yield str(s).strip()


def gather_texts(soup):
    """Yield the href of each text link that is followed by a Vatican image.

    A link qualifies when its ``href`` matches ``E/Roman/Texts`` and at
    least one later sibling is an ``<img>`` whose ``src`` matches
    ``Vatican`` (presumably the flag icon marking a Latin text; confirm
    against the index page).

    Args:
        soup: Parsed index page.

    Yields:
        The qualifying ``href`` values with surrounding whitespace removed.
    """
    link_pattern = re.compile('E/Roman/Texts')
    image_pattern = re.compile('Vatican')

    for link in soup.find_all('a', href=link_pattern):
        markers = link.find_next_siblings('img', src=image_pattern)
        if not markers:
            continue
        yield link.attrs['href'].strip()

def gather_parts(s):
    """Yield the href of each ``L/Roman/Texts`` link in *s*, minus fragment.

    Args:
        s: Parsed page whose ``<a>`` elements are searched.

    Yields:
        Each matching ``href``, stripped of surrounding whitespace and cut
        off at the first ``#`` if there is one.
    """
    pattern = re.compile('L/Roman/Texts')
    for anchor in s.find_all('a', href=pattern):
        # partition() returns the whole string as the head when no '#'
        # is present, so links without a fragment pass through unchanged.
        target, _, _ = anchor.attrs['href'].strip().partition('#')
        yield target


# Root that every href collected below is appended to.
BASE = 'http://penelope.uchicago.edu/Thayer/'

# Every distinct part href reachable from the texts listed in the book
# index; a set so a part linked from several places is fetched only once.
PARTS = set()
for t in gather_texts(load(BASE + 'E/HELP/Indexes/books.html')):
    for part in gather_parts(load(BASE + t)):
        PARTS.add(part)

# Sorted so that runs process (and print) the parts in a reproducible order.
for p in sorted(PARTS):
    try:
        # Build a flat file name from the href: drop the first 14
        # characters (the length of 'L/Roman/Texts/') and the last 6
        # (presumably a '*.html' suffix; TODO confirm against the site's
        # link format), then join the remaining path segments with dots.
        name = '.'.join(p[14:-6].split('/'))
        print(name)
        # Fetch and parse before opening the file, so a failed download or
        # parse neither leaves an empty file behind nor truncates the
        # output of an earlier successful run.
        lines = list(extract(load(BASE + p)))
        # Explicit encoding: the platform default may be unable to encode
        # non-ASCII characters in the texts.
        with open(name, 'w', encoding='utf-8') as out:
            out.writelines(s + '\n' for s in lines)
    except Exception as err:
        # Best effort: report the failure and carry on with the next part.
        print(err)