# Protoball: Scrape Civil War.py
def extract_cw_chrons(html_source):
    """Extract Civil War chronology entries from an HTML document.

    Every ``<p>`` whose text starts with ``[`` followed by a digit opens a
    new entry and supplies its headline.  Each later ``<p>`` is appended, as
    raw HTML, to the text of the most recently opened entry; paragraphs
    before the first entry are ignored.  An entry's ID is the number from
    the first ``CW <number>`` style reference in its text; an entry without
    one is numbered one past the entry before it.

    Args:
        html_source: HTML markup handed to ``BeautifulSoup``.

    Returns:
        A list of ``Chron`` objects (string attributes ``headline``, ``ID``
        and ``text``) in document order.  ``text`` is the cleaned HTML.

    Raises:
        ValueError: If the first entry carries no CW number, so there is
            nothing to count on from.
    """
    import re

    from bs4 import BeautifulSoup
    from bs4 import NavigableString

    strip_attrs = ('class', 'style')
    unwrap_tags = frozenset([
        'span', 'o:p', 'st1:city', 'st1:state', 'st1:place',
        'st1:placename', 'st1:placetype', 'st1:address', 'st1:street',
        'st2:citation', 'st1:country-region',
    ])

    entry_start = re.compile(r"^\[\d")
    cw_number = re.compile(r"CW\s?[–-]?\s?(\d+)")
    pball_file = re.compile(r"PBall file[^\d]+\d+\.?")

    def squash(text):
        """Replace newlines with spaces in *text*."""
        # NOTE(review): the second replace is a no-op as written (space ->
        # space).  It presumably collapsed a double space or a non-breaking
        # space before this source was reformatted -- confirm and restore.
        return text.replace("\n", ' ').replace(" ", " ")

    def clean_html(node):
        """Recursively normalise text, drop styling and unwrap inline tags."""
        # Iterate over a snapshot: a child that unwraps itself edits
        # node.contents, which would make a live iterator skip siblings.
        for child in list(node.contents):
            if isinstance(child, NavigableString):
                child.replace_with(squash(child.string))
            else:
                clean_html(child)
        for attr in strip_attrs:
            # Test the attribute dict directly; hasattr() on a Tag is a
            # child-tag lookup, not an attribute-presence check.
            node.attrs.pop(attr, None)
        if node.name in unwrap_tags:
            node.unwrap()

    def remove_empty_tags(node):
        """Delete direct children of *node* that contain no visible text."""
        # Snapshot for the same reason as above: decompose() removes the
        # child from node.contents while we are walking it.
        for child in list(node.contents):
            if child.get_text(strip=True) == "":
                child.decompose()

    def clean_text(text):
        """Return *text* (an HTML fragment) cleaned up for export."""
        # NOTE(review): no parser is named, so bs4 picks the best one
        # installed and the output can differ between machines.
        fragment = BeautifulSoup(text)
        clean_html(fragment)
        remove_empty_tags(fragment)
        return pball_file.sub("", str(fragment))

    class Chron:
        """One chronology entry: headline, numeric ID string and HTML text."""

        def __init__(self, headline, ID, text):
            self.headline = headline
            self.ID = ID
            self.text = text

        def __str__(self):
            return self.ID + " - " + self.headline + ": " + self.text

    # NOTE(review): parser left unspecified here as well; see clean_text.
    soup = BeautifulSoup(html_source)
    chrons = []
    for p in soup.find_all("p"):
        text = p.get_text()
        if entry_start.match(text):
            # Two entries in the source document lack the closing bracket.
            text = text.replace("[66 ", "[66] ").replace("[67 ", "[67] ")
            # Skip past "] " to reach the headline proper.
            headline = squash(text[text.find(']') + 2:]).strip()
            chrons.append(Chron(headline, "", ""))
        elif chrons:
            chrons[-1].text += str(p)

    prev_id = None
    for chron in chrons:
        found = cw_number.search(chron.text)
        if found:
            chron.ID = found.group(1)
        elif prev_id is None:
            raise ValueError(
                "no CW number found for the first entry (%r); cannot infer "
                "its ID" % chron.headline
            )
        else:
            chron.ID = str(int(prev_id) + 1)
        # Advance on every entry so that consecutive entries without a CW
        # number receive distinct, increasing IDs.
        prev_id = chron.ID
        chron.text = clean_text(chron.text)
    return chrons


def chrons_to_spreadsheet(chrons, path):
    """Write chronology entries to a CSV file.

    The file starts with a fixed header row.  Each entry then becomes one
    row with ``Title`` set to ``"CW-" + ID``, the headline and text columns
    filled from the entry, and the tags column set to ``"Civil War"``; all
    other columns are left empty.

    Args:
        chrons: Iterable of objects with string attributes ``ID``,
            ``headline`` and ``text``.
        path: Destination file path; an existing file is overwritten.
    """
    import csv

    header = [
        'Title',
        'Chronology Entry[Year Suffix]',
        'Chronology Entry[Year]',
        'Chronology Entry[Headline]',
        'Chronology Entry[Text]',
        'Chronology Entry[Tags]',
        'Chronology Entry[Date]',
        'Chronology Entry[Is in main chronology]',
    ]

    # newline='' lets the csv module control line endings (avoids blank
    # rows on Windows); UTF-8 keeps non-ASCII entry text writable
    # regardless of the platform's default encoding.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=header, restval='')
        writer.writeheader()
        for chron in chrons:
            writer.writerow({
                'Title': "CW-" + chron.ID,
                'Chronology Entry[Headline]': chron.headline,
                'Chronology Entry[Text]': chron.text,
                'Chronology Entry[Tags]': "Civil War",
            })