More parsing

This commit is contained in:
Bob Mottram 2014-04-26 14:04:57 +01:00
parent 2867871552
commit 21b3f80dc3

View File

@ -1,4 +1,5 @@
import os import os
import string
import HTMLParser, urllib, urlparse import HTMLParser, urllib, urlparse
class JargonFile(dict): class JargonFile(dict):
@ -33,7 +34,7 @@ class JargonParser(HTMLParser.HTMLParser):
def handle_data(self, data): def handle_data(self, data):
if "head" in self.currentSection: if "head" in self.currentSection:
# store the title # store the title
self.title = data self.title = data.strip()
self.bodyText = ''; self.bodyText = '';
elif "body" in self.currentSection: elif "body" in self.currentSection:
replacements = [' ',' ',' ','\t','\r','\n'] replacements = [' ',' ',' ','\t','\r','\n']
@ -45,21 +46,41 @@ class JargonParser(HTMLParser.HTMLParser):
if "head" in tag or "body" in tag: if "head" in tag or "body" in tag:
self.currentSection = tag; self.currentSection = tag;
def jargonSaneText(text): # Further sanitise the returned text
def jargonSaneText(title, text):
if len(text) < 2: if len(text) < 2:
return '' return ''
# usually in the format (title : text)
initsplit = text.split(' : ') initsplit = text.split(' : ')
if len(initsplit) < 2: if len(initsplit) < 2:
return '' # sometimes in the format (title[blurb] text)
initsplit = text.split('] ')
if len(initsplit) < 2:
# sometimes in the format (title adj. text)
initsplit = text.split(' adj. ')
initial = True # if all else fails look for the second instance of the title text
newtext = '' if len(initsplit) < 2:
for txt in initsplit: testsplit = text.split(title)
if not initial: if len(testsplit) >= 3:
newtext = newtext + txt initsplit = testsplit
initial = False initsplit[1] = ''
text = newtext testsplitctr = 0
for txt in testsplit:
if txt == ' ':
txt = title
if testsplitctr >= 2:
if testsplitctr >= 3:
initsplit[1] = initsplit[1] + ' '
initsplit[1] = initsplit[1] + txt
testsplitctr = testsplitctr + 1
if len(initsplit) < 2:
return ''
# get the second part of the split array (i.e. the description text)
text = initsplit[1]
sentsplit = text.split('.') sentsplit = text.split('.')
if len(sentsplit) > 1: if len(sentsplit) > 1:
@ -74,27 +95,72 @@ def jargonSaneText(text):
text = text.replace(' . ','. ') text = text.replace(' . ','. ')
text = text.replace(' .','. ') text = text.replace(' .','. ')
text = text.replace(' ',' ') text = text.replace(' ',' ')
text = filter(lambda x: x in string.printable, text)
return text.strip() return text.strip()
def validTitle(title):
    """Return True when *title* is acceptable as an entry title.

    A title is rejected when it is empty, when it contains the
    character 0xC2, or when it starts with "Letters", "Comments" or
    "Glossary".
    """
    # Test emptiness by value; the previous identity test against a string
    # literal only worked because the interpreter happens to intern ''.
    if not title:
        return False
    # NOTE(review): 0xC2 is presumably the lead byte of a UTF-8 encoded
    # non-ASCII character (e.g. a non-breaking space) - confirm.
    if '\xc2' in title:
        return False
    # startswith() accepts a tuple, so one call covers every excluded prefix
    return not title.startswith(("Letters", "Comments", "Glossary"))
# make an entry title usable as a file name by
# swapping every '/' in it for '-'
def jargonSaneTitle(title):
    return title.replace('/', '-') if '/' in title else title
def jargonCreateEntry(title, text, outputDir):
    """Write one entry to a text file inside *outputDir*.

    The file is named after the sanitised title with a '.txt' suffix and
    holds the title, a blank line, then the text. An existing file is
    never overwritten.

    Returns the path of the file written, or '' when a file with that
    name already existed.
    """
    # create the filename for the entry
    filename = outputDir
    if not outputDir.endswith('/'):
        filename = filename + '/'
    filename = filename + jargonSaneTitle(title) + '.txt'

    # don't overwrite existing files
    if os.path.isfile(filename):
        return ''

    # A with-block guarantees the file is flushed and closed; the previous
    # code referenced fp.close without calling it, so nothing closed it.
    with open(filename, 'w') as fp:
        fp.write(title + '\n\n' + text + '\n')
    return filename
def jargonReadFile(filename, outputDir):
    """Parse one source file and save its entry under *outputDir*.

    The file contents are fed to a JargonParser. When the parsed title
    passes validTitle(), is longer than one character, and the body text
    is not empty, the body is cleaned with jargonSaneText() and handed to
    jargonCreateEntry(); the value that call returns is printed.
    """
    # Read the whole file in one go and close it straight away; the handle
    # used to be left open and the text assembled line by line.
    with open(filename) as inFile:
        contents = inFile.read()

    parser = JargonParser()
    parser.feed(contents)

    # Compare the body by value: an identity test against '' depends on
    # string interning rather than on the text actually being empty.
    if validTitle(parser.title) and \
       parser.bodyText != '' and \
       len(parser.title) > 1:
        saneBodyText = jargonSaneText(parser.title, parser.bodyText)
        # single-argument parenthesised form prints identically on
        # Python 2 and also parses on Python 3
        print(jargonCreateEntry(parser.title, saneBodyText, outputDir))
# walk the tree below rootDir and pass every file found to jargonReadFile
# along with outputDir (excludeEntriesFilename is accepted but not used here)
def jargonImport(rootDir, excludeEntriesFilename, outputDir):
    for walkEntry in os.walk(rootDir):
        dirName, fileList = walkEntry[0], walkEntry[2]
        for filename in fileList:
            jargonReadFile('/'.join((dirName, filename)), outputDir)
if __name__ == "__main__":
    # run the import over ../original, writing to ../entries,
    # with an empty exclusion file name
    jargonImport(rootDir='../original',
                 excludeEntriesFilename='',
                 outputDir='../entries')