Beginning of jargon parser

This commit is contained in:
Bob Mottram 2014-04-06 13:23:46 +01:00
parent be7b4bbc95
commit f112f45b90

View File

@@ -1,20 +1,40 @@
import os
import HTMLParser
import HTMLParser, urllib, urlparse
class JargonParser(HTMLParser.HTMLParser):
    """HTML parser that tracks whether it is inside <head> or <body>.

    Text seen while inside <head> is stored in ``title`` and printed;
    text seen while inside <body> is printed prefixed with the section
    name. Text outside both sections is ignored.

    NOTE(review): every text node inside <head> overwrites ``title``, so
    text following the <title> element would replace it - confirm that
    this is acceptable for the input files.
    """

    # Only these exact tag names open a section. Substring matching would
    # also accept tags such as "thead", "tbody" or "header".
    _SECTION_TAGS = ("head", "body")

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.seen = {}  # not read or written by any method of this class
        self.currentSection = ''  # '' means "outside <head> and <body>"
        self.title = ''

    def handle_data(self, data):
        """Store or print *data* according to the current section."""
        if not self.currentSection:
            return
        # Single-argument parenthesised print behaves the same whether
        # print is a statement or a function.
        if self.currentSection == "head":
            self.title = data
            print("Title: " + self.title)
        else:
            print(self.currentSection + ": " + data)

    def handle_endtag(self, tag):
        """Leave the current section when its own closing tag is seen."""
        # Compare against the closing tag itself so that inner end tags
        # (e.g. </p> inside <body>) do not end the section, and assign to
        # the instance attribute rather than a throwaway local.
        if tag == self.currentSection:
            self.currentSection = ''

    def handle_starttag(self, tag, attributes):
        """Enter a section when a <head> or <body> start tag is seen."""
        if tag in self._SECTION_TAGS:
            self.currentSection = tag
def jargonReadFile(filename):
    """Parse the HTML file at *filename* with a JargonParser.

    The whole file is read into memory and fed to the parser in a single
    call. Returns None; the visible results are whatever the parser's
    handlers print.
    """
    # The context manager closes the file even if reading raises.
    with open(filename) as inFile:
        contents = inFile.read()
    parser = JargonParser()
    parser.feed(contents)
def jargonImport(rootDir):
    """Walk *rootDir* recursively and parse every file found.

    Prints each directory visited and each file name (tab-indented),
    then passes the file's path to jargonReadFile. Every file is parsed,
    whatever its extension.
    """
    for dirName, _subdirList, fileList in os.walk(rootDir):
        print('Found directory: %s' % dirName)
        for filename in fileList:
            print('\t%s' % filename)
            # os.path.join picks the right separator and avoids a doubled
            # one when dirName already ends with it.
            jargonReadFile(os.path.join(dirName, filename))
if __name__ == "__main__":
    # Run the import over both directories, in this order.
    # NOTE(review): 'original' looks like a leftover superseded by
    # '../original' - confirm whether both are still wanted.
    for sourceDir in ('original', '../original'):
        jargonImport(sourceDir)