# ppass/scrape.py
# (paste metadata: 62 lines, 1.6 KiB, Python)

# scrape latin texts from http://penelope.uchicago.edu/Thayer/E/Roman/home.html
import bs4
import re
import requests
def load(url):
    """Fetch *url* and parse the response body into a BeautifulSoup tree.

    Raises requests.HTTPError on 4xx/5xx responses instead of silently
    parsing an error page; the script's best-effort loop catches it.
    """
    # timeout: requests has no default timeout, so a stalled server would
    # otherwise hang the whole scrape forever.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return bs4.BeautifulSoup(r.content, 'lxml')
def extract(soup):
    """Yield stripped text strings from the main body of a Thayer text page.

    Moves every sibling that follows the page's header table into a detached
    <div> (stopping at the endnotes section), strips out page-furniture
    spans, then yields each remaining text node. Stripped strings may be
    empty; callers get one string per text node.
    """
    # Detached container: tags are moved here so the find_all below only
    # sees body content, not the rest of the page.
    root = soup.new_tag('div')
    # NOTE(review): class_ given as a tuple matches either 'headerbox' or
    # 'header' -- confirm against the bs4 version in use.
    header = soup.find('table', class_=('headerbox', 'header'))
    # list() snapshots the siblings: tag.extract() mutates the tree, which
    # would otherwise break the live next_siblings iterator.
    for tag in list(header.next_siblings):
        # Stop at the endnotes block; hasattr guards against
        # NavigableString siblings, which have no attrs.
        if hasattr(tag, 'attrs') and 'endnotes' in tag.attrs.get('class', ()):
            break
        root.append(tag.extract())
    # CSS classes marking navigation/markup spans rather than the text itself.
    bad = ('pagenum', 'linenum', 'translation_flag', 'verse_speaker',
           'ref', 'sec', 'chapter')
    for p in root.find_all('p'):
        for x in p.find_all(class_=bad):
            x.decompose()
        for s in p.strings:
            yield str(s).strip()
def gather_texts(soup):
    """Yield hrefs of links into E/Roman/Texts that are followed by a
    Vatican-icon sibling image on the index page."""
    href_pat = re.compile('E/Roman/Texts')
    icon_pat = re.compile('Vatican')
    for anchor in soup.find_all('a', href=href_pat):
        flagged = anchor.find_next_siblings('img', src=icon_pat)
        if flagged:
            yield anchor.attrs['href'].strip()
def gather_parts(s):
    """Yield the hrefs of links into L/Roman/Texts on a text's index page,
    with any #fragment removed so each page URL appears in canonical form."""
    for anchor in s.find_all('a', href=re.compile('L/Roman/Texts')):
        href = anchor.attrs['href'].strip()
        # partition('#')[0] == href when there is no fragment.
        yield href.partition('#')[0]
BASE = 'http://penelope.uchicago.edu/Thayer/'

# Collect the URL of every part (sub-page) of every flagged text.
PARTS = set()
for t in gather_texts(load(BASE + 'E/HELP/Indexes/books.html')):
    for part in gather_parts(load(BASE + t)):
        PARTS.add(part)

# Download each part and dump its text, one extracted string per line,
# into a file named after the URL path (slashes -> dots).
for p in PARTS:
    try:
        # Presumably p looks like 'L/Roman/Texts/.../<page>.html' -- the
        # [14:-6] slice strips a fixed-length prefix and the trailing
        # '*.html'. TODO(review): confirm against actual hrefs.
        name = '.'.join(p[14:-6].split('/'))
        print(name)
        # utf-8 explicitly: the Latin texts contain non-ASCII characters,
        # and the platform default encoding could corrupt or crash the dump.
        # 'w' (not 'w+'): the file is only ever written, never read back.
        with open(name, 'w', encoding='utf-8') as out:
            for s in extract(load(BASE + p)):
                out.write(s)
                out.write('\n')
    except Exception as err:
        # Best-effort scrape: report the failure and move on to the
        # remaining parts rather than aborting the whole run.
        print(err)