More parsing
This commit is contained in:
parent
2867871552
commit
21b3f80dc3
@ -1,4 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
import string
|
||||||
import HTMLParser, urllib, urlparse
|
import HTMLParser, urllib, urlparse
|
||||||
|
|
||||||
class JargonFile(dict):
|
class JargonFile(dict):
|
||||||
@ -33,7 +34,7 @@ class JargonParser(HTMLParser.HTMLParser):
|
|||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
if "head" in self.currentSection:
|
if "head" in self.currentSection:
|
||||||
# store the title
|
# store the title
|
||||||
self.title = data
|
self.title = data.strip()
|
||||||
self.bodyText = '';
|
self.bodyText = '';
|
||||||
elif "body" in self.currentSection:
|
elif "body" in self.currentSection:
|
||||||
replacements = [' ',' ',' ','\t','\r','\n']
|
replacements = [' ',' ',' ','\t','\r','\n']
|
||||||
@ -45,21 +46,41 @@ class JargonParser(HTMLParser.HTMLParser):
|
|||||||
if "head" in tag or "body" in tag:
|
if "head" in tag or "body" in tag:
|
||||||
self.currentSection = tag;
|
self.currentSection = tag;
|
||||||
|
|
||||||
def jargonSaneText(text):
|
# Further sanitise the returned text
|
||||||
|
def jargonSaneText(title, text):
|
||||||
if len(text) < 2:
|
if len(text) < 2:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
# usually in the format (title : text)
|
||||||
initsplit = text.split(' : ')
|
initsplit = text.split(' : ')
|
||||||
if len(initsplit) < 2:
|
if len(initsplit) < 2:
|
||||||
return ''
|
# sometimes in the format (title[blurb] text)
|
||||||
|
initsplit = text.split('] ')
|
||||||
|
if len(initsplit) < 2:
|
||||||
|
# sometimes in the format (title adj. text)
|
||||||
|
initsplit = text.split(' adj. ')
|
||||||
|
|
||||||
initial = True
|
# if all else fails, look for the second instance of the title text
|
||||||
newtext = ''
|
if len(initsplit) < 2:
|
||||||
for txt in initsplit:
|
testsplit = text.split(title)
|
||||||
if not initial:
|
if len(testsplit) >= 3:
|
||||||
newtext = newtext + txt
|
initsplit = testsplit
|
||||||
initial = False
|
initsplit[1] = ''
|
||||||
text = newtext
|
testsplitctr = 0
|
||||||
|
for txt in testsplit:
|
||||||
|
if txt == ' ':
|
||||||
|
txt = title
|
||||||
|
if testsplitctr >= 2:
|
||||||
|
if testsplitctr >= 3:
|
||||||
|
initsplit[1] = initsplit[1] + ' '
|
||||||
|
initsplit[1] = initsplit[1] + txt
|
||||||
|
testsplitctr = testsplitctr + 1
|
||||||
|
|
||||||
|
if len(initsplit) < 2:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
# get the second part of the split array (i.e. the description text)
|
||||||
|
text = initsplit[1]
|
||||||
|
|
||||||
sentsplit = text.split('.')
|
sentsplit = text.split('.')
|
||||||
if len(sentsplit) > 1:
|
if len(sentsplit) > 1:
|
||||||
@ -74,27 +95,72 @@ def jargonSaneText(text):
|
|||||||
text = text.replace(' . ','. ')
|
text = text.replace(' . ','. ')
|
||||||
text = text.replace(' .','. ')
|
text = text.replace(' .','. ')
|
||||||
text = text.replace(' ',' ')
|
text = text.replace(' ',' ')
|
||||||
|
text = filter(lambda x: x in string.printable, text)
|
||||||
|
|
||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
def jargonReadFile(filename):
|
def validTitle(title):
    """Return True when title is acceptable as an entry title.

    A title is rejected when it is empty, when it contains the '\\xc2'
    character (presumably left over from non-ASCII text in the source
    pages -- TODO confirm), or when it starts with "Letters", "Comments"
    or "Glossary".
    """
    # Test emptiness by value. The original `title is ''` compared object
    # identity with a literal, which only works when the interpreter
    # happens to share a single empty-string object.
    if not title:
        return False

    if '\xc2' in title:
        return False

    # startswith() accepts a tuple, so one call covers every rejected prefix.
    return not title.startswith(("Letters", "Comments", "Glossary"))
|
||||||
|
|
||||||
|
# remove any invalid characters from an entry title
|
||||||
|
# so that it can be saved in a filename
|
||||||
|
def jargonSaneTitle(title):
    """Return title with every '/' replaced by '-'.

    The result is used to build a file name, where a '/' would be read
    as a directory separator.
    """
    # Splitting on '/' and re-joining with '-' leaves a title without any
    # '/' unchanged, so no separate membership check is needed.
    return '-'.join(title.split('/'))
|
||||||
|
|
||||||
|
def jargonCreateEntry(title, text, outputDir):
    """Write one entry to a '.txt' file inside outputDir.

    The file contains the title, a blank line, the text and a trailing
    newline.

    Args:
        title: entry title. It is passed through jargonSaneTitle() to form
            the file name and written unmodified as the first line.
        text: entry body.
        outputDir: directory that receives the file, with or without a
            trailing '/'. An empty string means the current directory.

    Returns:
        The path of the file that was written, or '' when a file already
        exists at that path (existing entries are never overwritten).
    """
    # create the filename for the entry
    filename = outputDir
    # Only add the separator to a non-empty directory: an empty outputDir
    # would otherwise yield '/<title>.txt' at the filesystem root.
    if outputDir and not outputDir.endswith('/'):
        filename = filename + '/'
    filename = filename + jargonSaneTitle(title) + '.txt'

    # don't overwrite existing files
    if os.path.isfile(filename):
        return ''

    # The with-block closes the file even if write() raises. The original
    # wrote `fp.close` without parentheses, so close() was never called.
    with open(filename, 'w') as fp:
        fp.write(title + '\n\n' + text + '\n')
    return filename
|
||||||
|
|
||||||
|
def jargonReadFile(filename, outputDir):
    """Parse one HTML file and, if it yields a usable entry, save it.

    The file is fed to a JargonParser. When the parsed title passes
    validTitle(), is longer than one character, and the body text is
    non-empty, the body is cleaned with jargonSaneText() and handed to
    jargonCreateEntry(). The value returned by jargonCreateEntry() is
    printed.

    Args:
        filename: path of the HTML file to read.
        outputDir: directory passed on to jargonCreateEntry().
    """
    # Read the file in one call and close it straight away; the original
    # never closed the handle and rebuilt the text line by line in a
    # variable named `buffer`, which shadows a Python 2 builtin.
    with open(filename) as inFile:
        contents = inFile.read()

    parser = JargonParser()
    parser.feed(contents)

    # Compare the body by value: `is not ''` tested object identity and
    # only worked when the interpreter shared one empty-string object.
    if validTitle(parser.title) and \
       parser.bodyText != '' and \
       len(parser.title) > 1:
        saneBodyText = jargonSaneText(parser.title, parser.bodyText)
        # Parenthesised single argument prints the same under the Python 2
        # print statement and the Python 3 print function.
        print(jargonCreateEntry(parser.title, saneBodyText, outputDir))
|
||||||
|
|
||||||
def jargonImport(rootDir):
|
def jargonImport(rootDir, excludeEntriesFilename, outputDir):
    """Run jargonReadFile() on every file found beneath rootDir.

    Args:
        rootDir: directory tree to walk with os.walk().
        excludeEntriesFilename: accepted but not used by this function.
        outputDir: directory passed through to jargonReadFile().
    """
    for walkRow in os.walk(rootDir):
        # os.walk() yields (directory, sub-directories, files); only the
        # directory and its file names are needed here.
        folder = walkRow[0]
        names = walkRow[2]
        for name in names:
            sourcePath = folder + '/' + name
            jargonReadFile(sourcePath, outputDir)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
    # Script entry point: walk ../original and write entries to ../entries.
    # The middle argument (exclusion file name) is passed as an empty string.
    jargonImport('../original', '', '../entries')
|
||||||
|
Loading…
Reference in New Issue
Block a user