62 lines
1.6 KiB
Python
62 lines
1.6 KiB
Python
# scrape latin texts from http://penelope.uchicago.edu/Thayer/E/Roman/home.html
|
|
|
|
import bs4
|
|
import re
|
|
import requests
|
|
|
|
def load(url):
|
|
r = requests.get(url)
|
|
return bs4.BeautifulSoup(r.content, 'lxml')
|
|
|
|
def extract(soup):
|
|
root = soup.new_tag('div')
|
|
|
|
header = soup.find('table', class_=('headerbox', 'header'))
|
|
for tag in list(header.next_siblings):
|
|
if hasattr(tag, 'attrs') and 'endnotes' in tag.attrs.get('class', ()):
|
|
break
|
|
root.append(tag.extract())
|
|
|
|
bad = ('pagenum', 'linenum', 'translation_flag', 'verse_speaker',
|
|
'ref', 'sec', 'chapter')
|
|
for p in root.find_all('p'):
|
|
for x in p.find_all(class_=bad):
|
|
x.decompose()
|
|
for s in p.strings:
|
|
yield str(s).strip()
|
|
|
|
|
|
|
|
def gather_texts(soup):
|
|
for a in soup.find_all('a', href=re.compile('E/Roman/Texts')):
|
|
if a.find_next_siblings('img', src=re.compile('Vatican')):
|
|
yield a.attrs['href'].strip()
|
|
|
|
def gather_parts(s):
|
|
for a in s.find_all('a', href=re.compile('L/Roman/Texts')):
|
|
x = a.attrs['href'].strip()
|
|
i = x.find('#')
|
|
if i != -1:
|
|
x = x[:i]
|
|
yield x
|
|
|
|
|
|
BASE = 'http://penelope.uchicago.edu/Thayer/'
|
|
|
|
#part_clean = re.compile()
|
|
PARTS = set()
|
|
for t in gather_texts(load(BASE + 'E/HELP/Indexes/books.html')):
|
|
for part in gather_parts(load(BASE + t)):
|
|
PARTS.add(part)
|
|
|
|
for p in PARTS:
|
|
try:
|
|
name = '.'.join(p[14:-6].split('/'))
|
|
print(name)
|
|
with open(name, 'w+') as out:
|
|
for s in extract(load(BASE + p)):
|
|
out.write(s)
|
|
out.write('\n')
|
|
except Exception as err:
|
|
print(err)
|