Beginning of jargon parser
This commit is contained in:
parent
be7b4bbc95
commit
f112f45b90
@ -1,20 +1,40 @@
|
|||||||
import os
|
import os
|
||||||
import HTMLParser
|
import HTMLParser, urllib, urlparse
|
||||||
|
|
||||||
|
class JargonParser(HTMLParser.HTMLParser):
    """HTML parser that records the page title and echoes section text.

    While a ``<head>`` element is open, non-blank text is stored in
    ``self.title`` and printed with a ``Title: `` prefix.  While a
    ``<body>`` element is open, every text chunk is printed prefixed with
    the section name.  Text outside both sections is ignored.

    Attributes:
        seen: dict created empty; not referenced by the methods of this
            class.
        currentSection: name of the open section ('head' or 'body'), or
            '' when no tracked section is open.
        title: most recent non-blank text seen inside ``<head>``.
    """

    # Section tags are compared by exact name so that tags which merely
    # contain these words (thead, tbody, header) do not switch sections.
    _SECTION_TAGS = ('head', 'body')

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.seen = {}
        self.currentSection = ''
        self.title = ''

    def handle_data(self, data):
        """Store head text as the title, or echo text found in the body."""
        if not self.currentSection:
            return
        if self.currentSection == 'head':
            # Whitespace between head elements must not clobber a title
            # that has already been captured.
            if data.strip():
                # store the title
                self.title = data
                print("Title: " + self.title)
        else:
            print(self.currentSection + ": " + data)

    def handle_endtag(self, tag):
        """Leave the current section when its own closing tag is seen."""
        # Must assign through self: a bare local name would be discarded
        # and the section would never be reset.
        if tag == self.currentSection:
            self.currentSection = ''

    def handle_starttag(self, tag, attributes):
        """Enter a section when a ``<head>`` or ``<body>`` tag opens."""
        if tag in self._SECTION_TAGS:
            self.currentSection = tag
|
||||||
|
|
||||||
def jargonReadFile(filename):
    """Read one file and feed its entire contents to a JargonParser.

    Args:
        filename: path of the file to open and parse.

    Returns:
        None.  Any output comes from the parser's handler methods.
    """
    # The context manager closes the handle even if reading raises;
    # a single read() replaces the per-line string concatenation.
    with open(filename) as inFile:
        contents = inFile.read()
    parser = JargonParser()
    parser.feed(contents)
|
||||||
|
|
||||||
def jargonImport(rootDir):
    """Walk a directory tree and parse every file found in it.

    Args:
        rootDir: directory whose files (including those in all nested
            subdirectories) are passed to jargonReadFile.
    """
    # os.walk already descends into subdirectories, so the subdirectory
    # list it yields is not needed here.
    for dirName, _subdirList, fileList in os.walk(rootDir):
        for filename in fileList:
            # os.path.join uses the platform separator and avoids doubled
            # slashes when dirName already ends with one.
            jargonReadFile(os.path.join(dirName, filename))
|
||||||
|
|
||||||
# Script entry point: when run directly (not imported), parse every file
# under the directory named below, resolved against the working directory.
if __name__ == "__main__":
    jargonImport('../original')
|
||||||
|
Loading…
Reference in New Issue
Block a user