Protoball:Mlb chron to import csv.py

From Protoball
Jump to navigation Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
def mlb_chron_to_cvs(
	output_path="/home/dave/Desktop/chron_import.csv",
	date_ranges=("1000-1500", "1501-1700", "1701-1800", "1801-1825", "1826-1870"),
	url_template="http://mlb.mlb.com/gen/hb/chron/{0}.xml",
):
	"""Download the MLB chronology feeds and write them to an import CSV.

	For every date range one XML feed is fetched, each ``<item>`` with a
	non-empty description becomes an ``Entry``, and all entries are written
	as rows (after a header row) to ``output_path``.

	The function name, including its "cvs" spelling, is kept as-is so that
	existing callers keep working.  Calling it with no arguments behaves as
	before: same feeds, same output file.

	Args:
		output_path: Path of the CSV file to create or overwrite.
		date_ranges: Iterable of range strings; each one is substituted into
			``url_template`` to build a feed URL.
		url_template: ``str.format`` template with one positional slot for
			the date range.

	Returns:
		None.  The result is the file written at ``output_path``.

	Raises:
		IndexError: if an item lacks one of the expected ``field`` elements.
		OSError: if a feed cannot be fetched or the file cannot be written
			(``urllib.error.URLError`` is an ``OSError`` subclass).
	"""
	class Entry():
		"""One chronology item plus the clean-up applied before export."""

		# Tags found in these sets are moved out of the tag list and into the
		# dedicated location / game columns.  Sets give O(1) membership tests
		# and are built once instead of on every distribute_tags() call.
		LOCATIONS = frozenset([
			'California', 'Canada', 'Harvard', 'Illinois', 'New England',
			'New Jersey', 'Philadelphia', 'South', 'Wisconsin',
			'Western New York', 'Texas',
		])
		GAMES = frozenset([
			'Bat-Ball', 'Cricket', 'Oddball', 'Stoolball', 'Town Ball',
			'Wicket', 'Xenoball', 'Rounders', 'Base Ball',
		])
		# Feed spellings mapped to the spellings written to the CSV.
		TAG_SUBSTITUTIONS = {
			'WNY': 'Western New York',
			'PreKnicks': 'Pre-Knicks',
			'NewEngland': 'New England',
			'AfricanAmericans': 'African Americans',
			'NewJersey': 'New Jersey',
			'TownBall': 'Town Ball',
			'BaseBall': 'Base Ball',
			'BatBall': 'Bat-Ball',
			'IllinoisMissouri': 'Illinois',
		}

		def __init__(self):
			self.headline = ""
			self.year = 0
			self.ID = ""
			self.description = ""
			self.tags = []
			self.location = ""
			self.game = ""
			self.suffix = ""

		def prep(self):
			"""Run every clean-up step; call once all raw fields are set."""
			self.clean_ID_and_assign_suffix()
			self.clean_description()
			self.distribute_tags()

		def clean_ID_and_assign_suffix(self):
			"""Lower-case the 'C.'/'S.' markers in the ID and derive the suffix.

			The suffix becomes 'c' when the ID contains 'c.', otherwise 's'
			when it contains 's.'; in any other case it is left unchanged.
			"""
			self.ID = self.ID.replace('C.', 'c.').replace('S.', 's.')
			if 'c.' in self.ID:
				self.suffix = 'c'
			elif 's.' in self.ID:
				self.suffix = 's'

		def clean_description(self):
			"""Strip ``class`` attributes from <p> tags and drop html/body wrappers."""
			from bs4 import BeautifulSoup
			# Name the parser explicitly: without it bs4 picks whichever parser
			# happens to be installed (and warns).  lxml is already required by
			# the "xml" feed parsing in this function, and it is the parser
			# that adds the <html>/<body> wrappers removed below.
			soup = BeautifulSoup(self.description, "lxml")
			for p in soup.find_all("p"):
				# has_attr() checks the HTML attribute; hasattr() would ask
				# about a Python attribute of the Tag object instead.
				if p.has_attr("class"):
					del p["class"]
			# Unwrap each wrapper only if it is really there, so a document
			# without a <body> cannot raise AttributeError on None.
			if soup.html is not None:
				soup.html.unwrap()
			if soup.body is not None:
				soup.body.unwrap()
			self.description = str(soup)

		def distribute_tags(self):
			"""Normalise tag names and split off the location and game tags.

			When several tags name a location (or a game) the last one wins.
			"""
			remaining = []
			for tag in self.tags:
				tag = self.TAG_SUBSTITUTIONS.get(tag, tag)
				if tag in self.LOCATIONS:
					self.location = tag
				elif tag in self.GAMES:
					self.game = tag
				else:
					remaining.append(tag)
			self.tags = remaining

	def fetch_items(date_range):
		"""Download one feed and return its ``<item>`` elements."""
		from urllib import request
		# Imported here, as in clean_description, so bs4 is only needed once
		# a feed is actually parsed.
		from bs4 import BeautifulSoup
		# The context manager closes the HTTP response even if read() fails.
		with request.urlopen(url_template.format(date_range)) as response:
			payload = response.read()
		return BeautifulSoup(payload, "xml").find_all("item")

	def field(item, key):
		"""Return the string content of the item's first ``field`` with this key."""
		return item.select("field[key={0}]".format(key))[0].string

	entries = []
	for date_range in date_ranges:
		for item in fetch_items(date_range):
			entry = Entry()
			entry.headline = field(item, "title")
			entry.ID = field(item, "displayYear")
			entry.description = field(item, "description")
			if not entry.description:
				# Items without a description are left out of the export.
				continue
			entry.year = int(field(item, "year"))
			entry.tags = [itemTag['displayName'] for itemTag in item.find_all("itemTag")]
			entry.prep()
			entries.append(entry)

	import csv
	rows = [[
		'Title',
		'Chronology Entry[Headline]',
		'Chronology Entry[Year]',
		'Chronology Entry[Year Suffix]',
		'Chronology Entry[Is in main chronology]',
		'Chronology Entry[Location]',
		'Chronology Entry[Game]',
		'Chronology Entry[Tags]',
		'Chronology Entry[Text]',
	]]
	for entry in entries:
		# Column order must match the header row above.
		rows.append([
			entry.ID,
			entry.headline,
			entry.year,
			entry.suffix,
			"yes",
			entry.location,
			entry.game,
			",".join(entry.tags),
			entry.description,
		])

	# newline='' is what the csv module requires of its file object (it keeps
	# embedded newlines and row terminators intact on every platform); the
	# explicit encoding keeps non-ASCII text independent of the locale; the
	# with-block closes the file even if a row fails to serialise.
	with open(output_path, 'w', newline='', encoding='utf-8') as f:
		csv.writer(f).writerows(rows)