JargonFile/import/importjargon.py
2014-04-06 16:16:39 +01:00

101 lines
2.7 KiB
Python

import os
import HTMLParser, urllib, urlparse
class JargonFile(dict):
    """A ``dict`` that traces item reads, item writes and updates on stdout.

    Each ``d[key]`` lookup prints ``GET <key>``, each ``d[key] = val``
    prints ``SET <key> <val>`` and each ``update()`` call prints its
    arguments before storing the items one by one through ``__setitem__``.
    Items passed to the constructor are stored by ``dict.__init__`` directly
    and are therefore not traced.
    """

    def __init__(self, *arg, **kw):
        """Initialise the mapping exactly like ``dict(*arg, **kw)``."""
        # The original called super(CustomDictOne, self), an undefined name,
        # so every instantiation raised NameError.
        super(JargonFile, self).__init__(*arg, **kw)

    def __getitem__(self, key):
        """Return the value stored under *key*, tracing the access.

        Raises KeyError (before anything is printed) if *key* is missing.
        """
        val = dict.__getitem__(self, key)
        # Single pre-formatted argument: prints the same text whether
        # ``print`` is a statement (Python 2) or a function (Python 3).
        # The 1-tuple keeps tuple keys from being unpacked by ``%``.
        print('GET %s' % (key,))
        return val

    def __setitem__(self, key, val):
        """Store *val* under *key*, tracing the assignment."""
        print('SET %s %s' % (key, val))
        dict.__setitem__(self, key, val)

    def __repr__(self):
        """Return ``ClassName({...})`` using the plain dict representation."""
        dictrepr = dict.__repr__(self)
        return '%s(%s)' % (type(self).__name__, dictrepr)

    def update(self, *args, **kwargs):
        """Update from ``dict(*args, **kwargs)``, tracing every assignment."""
        print('update %s %s' % (args, kwargs))
        # items() instead of the Python-2-only iteritems(); going through
        # self[k] = v keeps the per-item SET trace.
        for k, v in dict(*args, **kwargs).items():
            self[k] = v
class JargonParser(HTMLParser.HTMLParser):
    """HTML parser that collects a page's title and flattened body text.

    After ``feed()``:

    * ``title`` holds the last non-blank text chunk seen while inside
      ``<head>`` (any text in the head, not only the ``<title>`` element).
    * ``bodyText`` holds every text chunk seen while inside ``<body>``,
      whitespace-normalised, each followed by a single space.
    * ``currentSection`` is ``''``, ``'head'`` or ``'body'``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.seen = {}
        self.currentSection = ''
        self.title = ''
        self.bodyText = ''

    def handle_data(self, data):
        """Route a text chunk to the title or the body text."""
        if self.currentSection == 'head':
            # Ignore whitespace-only chunks (e.g. the newline after
            # </title>) so they cannot overwrite a title already stored.
            if data.strip():
                # store the title
                self.title = data
                self.bodyText = ''
        elif self.currentSection == 'body':
            # Turn tabs/CR/LF into spaces, trim the ends and collapse runs
            # of whitespace, so each chunk contributes single-spaced text.
            cleaned = ' '.join(data.split())
            self.bodyText = self.bodyText + cleaned + ' '

    def handle_starttag(self, tag, attributes):
        """Track whether the parser is inside ``<head>`` or ``<body>``."""
        # Exact match: the previous substring test ("head" in tag) also
        # fired for <thead>/<header>, which flipped the parser back into
        # head mode, overwrote the title and discarded the body text.
        if tag in ('head', 'body'):
            self.currentSection = tag
def jargonSaneText(text):
    """Extract a tidied definition from flattened page text.

    The text is expected to look like ``<headword> : <definition>``.
    Everything up to and including the first ``' : '`` is dropped, anything
    after the last full stop is cut off, spaces before full stops are
    removed and runs of spaces are collapsed.

    Returns the cleaned definition, or ``''`` when *text* is shorter than
    two characters or contains no ``' : '`` separator.
    """
    if len(text) < 2:
        return ''

    # Split only on the FIRST separator. The old loop re-joined the later
    # pieces without ' : ', so a colon inside the definition fused the
    # surrounding words together.
    _headword, separator, text = text.partition(' : ')
    if not separator:
        return ''

    # Keep whole sentences only: drop whatever trails the last full stop.
    last_stop = text.rfind('.')
    if last_stop != -1:
        text = text[:last_stop + 1]

    text = text.replace(' . ', '. ')
    text = text.replace(' .', '. ')
    # The replacements above can leave doubled spaces; the original's final
    # replace(' ', ' ') was a no-op, so collapse them explicitly.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text.strip()
def jargonReadFile(filename):
    """Parse one HTML file and print its title and cleaned body text.

    The file is read in full, fed to a ``JargonParser``, and, when the
    parser found a title longer than one character and some body text, the
    body text is passed through ``jargonSaneText`` and both are printed.
    Nothing is printed otherwise. Returns ``None``.
    """
    # ``with`` closes the handle even if reading fails; the original never
    # closed it. read() replaces the line-by-line string concatenation.
    with open(filename) as inFile:
        contents = inFile.read()

    parser = JargonParser()
    parser.feed(contents)

    # The original compared with ``is not ''``, which tests object identity
    # rather than emptiness. len(title) > 1 already implies a non-empty
    # title, so only the body needs a separate truthiness check.
    if len(parser.title) > 1 and parser.bodyText:
        parser.bodyText = jargonSaneText(parser.bodyText)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Title: " + parser.title)
        print("Text: " + parser.bodyText + "\n")
def jargonImport(rootDir):
    """Walk *rootDir* recursively and process every file found.

    Each file in every directory visited by ``os.walk`` is handed to
    ``jargonReadFile``. Returns ``None``.
    """
    for dirName, _subdirList, fileList in os.walk(rootDir):
        for filename in fileList:
            # os.path.join instead of a hard-coded '/' keeps the path
            # portable and avoids a doubled separator when dirName already
            # ends with one.
            jargonReadFile(os.path.join(dirName, filename))
# Script entry point: when run directly (not imported), import every file
# found under the '../original' directory, relative to the working directory.
if __name__ == '__main__':
    source_root = '../original'
    jargonImport(source_root)