# Protoball:Scrape Civil War.py
#
# From Protoball
# Jump to navigation Jump to search
def extract_cw_chrons(html_source):
    """Extract Civil War chronology entries from an HTML document.

    An entry starts at a ``<p>`` whose text begins with ``[`` followed by a
    digit. The headline is the text that follows the first ``]`` (and the one
    character after it). The markup of every following ``<p>`` is appended to
    that entry's text until the next headline paragraph; paragraphs that
    appear before the first headline are ignored.

    Each entry's ID is the number of the first "CW-<n>" marker found in its
    raw markup. An entry without a marker gets the previous entry's ID plus
    one. The entry text is then cleaned: newlines become spaces, double
    spaces are collapsed once, ``class``/``style`` attributes are dropped,
    ``span`` and the listed namespaced tags are unwrapped, empty top-level
    nodes are removed, and "PBall file ... <digits>" references are stripped.

    Args:
        html_source: The HTML markup to parse (anything BeautifulSoup
            accepts).

    Returns:
        A list of ``Chron`` objects with string attributes ``headline``,
        ``ID`` and ``text``, in document order.

    Raises:
        ValueError: If the first entry has no "CW-<n>" marker, since there
            is no previous ID to continue from.
    """
    import re

    from bs4 import BeautifulSoup
    from bs4 import NavigableString

    # Attributes removed from every tag.
    kill_list = ('class', 'style')
    # Tags that are replaced by their own contents.
    unwrap_list = frozenset([
        'span', 'o:p', 'st1:city', 'st1:state', 'st1:place',
        'st1:placename', 'st1:placetype', 'st1:address', 'st1:street',
        'st2:citation', 'st1:country-region',
    ])

    # Raw strings: "\[", "\d" and "\s" are not valid string escapes.
    bracket_and_number = re.compile(r"^\[\d")
    cw = re.compile(r"CW\s?[–-]?\s?(\d+)")
    pball_file = re.compile(r"PBall file[^\d]+\d+\.?")

    def clean_html(soup):
        """Recursively normalise whitespace and strip presentation markup."""
        # Iterate over a snapshot: unwrapping a child changes the parent's
        # child list, and iterating the live list would skip the sibling
        # that follows an empty unwrapped tag.
        for child in list(soup.children):
            if isinstance(child, NavigableString):
                child.replace_with(
                    child.string.replace("\n", ' ').replace("  ", " "))
            else:
                clean_html(child)
        for kill in kill_list:
            if kill in soup.attrs:
                del soup[kill]
        if soup.name in unwrap_list:
            soup.unwrap()

    def remove_empty_tags(soup):
        """Remove direct children of ``soup`` that contain no visible text."""
        # Snapshot for the same reason as above: decompose() removes the
        # node from soup.contents, which would make a live iteration skip
        # the next node.
        for child in list(soup.contents):
            if child.get_text(strip=True) == "":
                child.decompose()

    def clean_text(text):
        """Return ``text`` (an HTML fragment) as cleaned-up markup."""
        # NOTE(review): no parser is named, so bs4 picks the best one that
        # is installed and the serialised markup may differ between
        # environments -- consider pinning one.
        soup = BeautifulSoup(text)
        clean_html(soup)
        remove_empty_tags(soup)
        return pball_file.sub("", str(soup))

    class Chron:
        """One chronology entry: headline, numeric ID (as a string), text."""

        def __init__(self, headline, ID, text):
            self.headline = headline
            self.ID = ID
            self.text = text

        def __str__(self):
            return self.ID + " - " + self.headline + ": " + self.text

    soup = BeautifulSoup(html_source)

    chrons = []
    for p in soup.find_all("p"):
        text = p.get_text()
        if bracket_and_number.match(text):
            # Two headlines are missing their closing bracket in the source.
            text = text.replace("[66 ", "[66] ").replace("[67 ", "[67] ")
            # Skip the "]" and the single character after it.
            headline = text[text.find(']') + 2:]
            headline = headline.replace("\n", " ").replace("  ", " ").strip()
            chrons.append(Chron(headline, "", ""))
        elif chrons:
            chrons[-1].text += str(p)

    prev_id = None
    for chron in chrons:
        match = cw.search(chron.text)
        if match:
            chron.ID = match.group(1)
        elif prev_id is None:
            raise ValueError(
                "first chronology entry %r has no 'CW-<number>' marker, "
                "so its ID cannot be inferred" % chron.headline)
        else:
            chron.ID = str(int(prev_id) + 1)
        # Track inferred IDs too, so consecutive entries without a marker
        # keep counting up instead of all receiving the same ID.
        prev_id = chron.ID
        chron.text = clean_text(chron.text)

    return chrons

def chrons_to_spreadsheet(chrons, path):
    """Write chronology entries to ``path`` as a CSV spreadsheet.

    The first row is a fixed eight-column header. Every entry in ``chrons``
    becomes one row in which only four columns are filled: the title
    ("CW-" followed by the entry ID), the headline, the text, and the tag
    "Civil War". The year suffix, year, date and "is in main chronology"
    columns are left empty.

    Args:
        chrons: Iterable of objects with string attributes ``ID``,
            ``headline`` and ``text``.
        path: Destination file path; an existing file is overwritten.
    """
    import csv

    header = [
        'Title',
        'Chronology Entry[Year Suffix]',
        'Chronology Entry[Year]',
        'Chronology Entry[Headline]',
        'Chronology Entry[Text]',
        'Chronology Entry[Tags]',
        'Chronology Entry[Date]',
        'Chronology Entry[Is in main chronology]',
    ]
    # Positions of the columns that are actually populated.
    title, headline, text, tags = 0, 3, 4, 5

    rows = [header]
    for chron in chrons:
        row = [''] * len(header)
        row[title] = "CW-" + chron.ID
        row[headline] = chron.headline
        row[text] = chron.text
        row[tags] = "Civil War"
        rows.append(row)

    # newline='' lets the csv module control line endings (otherwise extra
    # blank lines appear on platforms that translate "\n"). UTF-8 keeps
    # non-ASCII characters in the scraped text from failing under a narrower
    # platform default encoding. The with-block closes the file even if a
    # write raises.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(rows)