Protoball:Mlb chron to import csv.py

def mlb_chron_to_csv():
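	"""Fetch the MLB historical chronology XML feeds and write the entries
	to a CSV file formatted for import (one row per chronology entry)."""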
	class Entry():
		def __init__(self):
			self.headline = ""
			self.year = 0
			self.ID = ""
			self.description = ""
			self.tags = []
			self.location = ""
			self.game = ""
			self.suffix = ""

		def prep(self):
			self.clean_ID_and_assign_suffix()
			self.clean_description()
			self.distribute_tags()

		def clean_ID_and_assign_suffix(self):
			# Normalize 'C.'/'S.' markers in the display year to lowercase and
			# record which one appears as the entry's year suffix ('c' or 's').
			self.ID = self.ID.replace('C.', 'c.').replace('S.', 's.')
			if 'c.' in self.ID:
				self.suffix = 'c'
			elif 's.' in self.ID:
				self.suffix = 's'

		def clean_description(self):
			# Strip class attributes from <p> tags and unwrap any <html>/<body>
			# wrapper so only the inner markup is kept.
			from bs4 import BeautifulSoup
			soup = BeautifulSoup(self.description, "html.parser")
			for p in soup.find_all("p"):
				if p.has_attr("class"):
					del p["class"]
			if soup.html:
				soup.html.unwrap()
				soup.body.unwrap()
			self.description = str(soup)

		def distribute_tags(self):
			# Split the raw tag list into a location, a game, and the remaining
			# tags, applying name substitutions first (e.g. 'WNY' -> 'Western New York').
			locations = ['California', 'Canada', 'Harvard', 'Illinois', 'New England', 'New Jersey', 'Philadelphia', 'South', 'Wisconsin', 'Western New York', 'Texas']
			games = ['Bat-Ball', 'Cricket', 'Oddball', 'Stoolball', 'Town Ball', 'Wicket', 'Xenoball', 'Rounders', 'Base Ball']
			sub = {
				'WNY': 'Western New York',
				'PreKnicks': 'Pre-Knicks',
				'NewEngland': 'New England',
				'AfricanAmericans': 'African Americans',
				'NewJersey': 'New Jersey',
				'TownBall': 'Town Ball',
				'BaseBall': 'Base Ball',
				'BatBall': 'Bat-Ball',
				'IllinoisMissouri': 'Illinois',
			}
			tags = list(self.tags)
			self.tags = []
			for tag in tags:
				tag = sub.get(tag, tag)
				if tag in locations:
					self.location = tag
				elif tag in games:
					self.game = tag
				else:
					self.tags.append(tag)
			

	from urllib import request
	from bs4 import BeautifulSoup
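	# Fetch each date range's chronology feed and collect an Entry per <item>.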
	date_ranges = ["1000-1500", "1501-1700", "1701-1800", "1801-1825", "1826-1870"]
	entries = []
	for date_range in date_ranges:
		r = request.urlopen("http://mlb.mlb.com/gen/hb/chron/{0}.xml".format(date_range))
		t = r.read()
		soup = BeautifulSoup(t, "xml")
		items = soup.find_all("item")
		for item in items:
			entry = Entry()
			entry.headline = item.select("field[key=title]")[0].string
			entry.ID = item.select("field[key=displayYear]")[0].string
			entry.description = item.select("field[key=description]")[0].string
			if not entry.description:
				continue
			entry.year = int(item.select("field[key=year]")[0].string)
			entry.tags = [itemTag['displayName'] for itemTag in item.find_all("itemTag")]
			entry.prep()
			entries.append(entry)
	
	import csv
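	# Build the output rows: a header of import field names, then one row per entry.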
	rows = []
	rows.append(['Title', 'Chronology Entry[Headline]', 'Chronology Entry[Year]', 'Chronology Entry[Year Suffix]', 'Chronology Entry[Is in main chronology]', 'Chronology Entry[Location]', 'Chronology Entry[Game]', 'Chronology Entry[Tags]', 'Chronology Entry[Text]'])

	(title, headline, year, suffix, in_main, location, game, tags, text) = (0, 1, 2, 3, 4, 5, 6, 7, 8)
	for entry in entries:
		row = ["", "", "", "", "", "", "", "", ""]
		row[title] = entry.ID
		row[headline] = entry.headline
		row[year] = entry.year
		row[suffix] = entry.suffix
		row[in_main] = "yes"
		row[location] = entry.location
		row[game] = entry.game
		row[tags] = ",".join(entry.tags)
		row[text] = entry.description
		rows.append(row)

	with open("/home/dave/Desktop/chron_import.csv", 'w', newline='', encoding='utf-8') as f:
		writer = csv.writer(f)
		writer.writerows(rows)
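
# Example invocation (a minimal sketch; assumes the hard-coded output path above is
# writable and that lxml is installed for BeautifulSoup's "xml" parser):
if __name__ == "__main__":
	mlb_chron_to_csv()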