More parsing

2014-04-26 14:04:57 +01:00
parent 2867871552
commit 21b3f80dc3
1 changed files with 84 additions and 18 deletions
--- a/import/importjargon.py
+++ b/import/importjargon.py
@@ -1,4 +1,5 @@
 import os
 import string
 import HTMLParser, urllib, urlparse
 class JargonFile(dict):
@@ -33,7 +34,7 @@ class JargonParser(HTMLParser.HTMLParser):
    def handle_data(self, data):
        if "head" in self.currentSection:
            # store the title
-            self.title = data
+            self.title = data.strip()
            self.bodyText = '';
        elif "body" in self.currentSection:
            replacements = ['    ','   ','  ','\t','\r','\n']
@@ -45,21 +46,41 @@ class JargonParser(HTMLParser.HTMLParser):
        if "head" in tag or "body" in tag:
            self.currentSection = tag;
-def jargonSaneText(text):
+# Further sanitise the returned text
 def jargonSaneText(title, text):
    if len(text) < 2:
        return ''
    # usually in the format (title : text)
    initsplit = text.split(' : ')
    if len(initsplit) < 2:
-        return ''
+       # sometimes in the format (title[blurb] text)
       initsplit = text.split('] ')
       if len(initsplit) < 2:
          # sometimes in the format (title adj. text)
          initsplit = text.split(' adj. ')
-    initial = True
+    # if all else fails, look for the second instance of the title text
-    newtext = ''
+    if len(initsplit) < 2:
-    for txt in initsplit:
+       testsplit = text.split(title)
-        if not initial:
+       if len(testsplit) >= 3:
-            newtext = newtext + txt
+          initsplit = testsplit
-        initial = False
+          initsplit[1] = ''
-    text = newtext
+          testsplitctr = 0
          for txt in testsplit:
             if txt == ' ':
                txt = title
             if testsplitctr >= 2:
                if testsplitctr >= 3:
                   initsplit[1] = initsplit[1] + ' '
                initsplit[1] = initsplit[1] + txt
             testsplitctr = testsplitctr + 1
    if len(initsplit) < 2:
       return ''
    # get the second part of the split array (i.e. the description text)
    text = initsplit[1]
    sentsplit = text.split('.')
    if len(sentsplit) > 1:
@@ -74,27 +95,72 @@ def jargonSaneText(text):
    text = text.replace(' . ','. ')
    text = text.replace(' .','. ')
    text = text.replace('  ',' ')
    text = filter(lambda x: x in string.printable, text)
    return text.strip()
-def jargonReadFile(filename):
def validTitle(title):
    """Return True when ``title`` names an entry that should be imported.

    A title is rejected when it is empty, when it contains the character
    '\\xc2', or when it starts with "Letters", "Comments" or "Glossary".
    """
    # Test emptiness by truthiness: ``title is ''`` checks object identity,
    # which is not guaranteed to be true for every empty string.
    if not title:
        return False
    # NOTE(review): '\xc2' is presumably the lead byte of a UTF-8 encoded
    # non-ASCII character in the source pages -- confirm.
    if '\xc2' in title:
        return False
    # str.startswith accepts a tuple, so one call covers every prefix.
    return not title.startswith(("Letters", "Comments", "Glossary"))
 # remove any invalid characters from an entry title
 # so that it can be saved in a filename
 def jargonSaneTitle(title):
     """Return ``title`` with every '/' replaced by '-'.

     The result is used as part of a file name, where '/' would be read
     as a directory separator.  Titles without a '/' are returned as is.
     """
     return title.replace('/', '-') if '/' in title else title
 def jargonCreateEntry(title, text, outputDir):
     """Write one entry to ``<outputDir>/<sanitised title>.txt``.

     The file contains the title, a blank line, and then the text.
     An existing file is never overwritten.

     Returns the path of the file that was written, or '' when a file
     with that name already existed and nothing was written.
     """
     # create the filename for the entry
     filename = outputDir
     if not outputDir.endswith('/'):
         filename = filename + '/'
     filename = filename + jargonSaneTitle(title) + '.txt'

     # don't overwrite existing files
     if os.path.isfile(filename):
         return ''

     # The with-block closes the file even if write() raises; a bare
     # ``fp.close`` (no parentheses) would only name the method.
     with open(filename, 'w') as fp:
         fp.write(title + '\n\n' + text + '\n')
     return filename
 def jargonReadFile(filename, outputDir):
     """Parse one HTML file and save it as an entry in ``outputDir``.

     The file contents are fed to a JargonParser.  When the parsed title
     passes validTitle and is longer than one character, and the body
     text is not empty, the body is cleaned with jargonSaneText and
     written with jargonCreateEntry, whose return value is printed.
     """
     # Read the whole file at once; the with-block closes it afterwards.
     with open(filename) as inFile:
         html = inFile.read()

     parser = JargonParser()
     parser.feed(html)

     # Compare the body text by value: ``is not ''`` tests identity only.
     if (validTitle(parser.title) and
             parser.bodyText != '' and
             len(parser.title) > 1):
         saneBodyText = jargonSaneText(parser.title, parser.bodyText)
         # Single-argument print(...) behaves the same as a statement
         # under Python 2 and as a function call under Python 3.
         print(jargonCreateEntry(parser.title, saneBodyText, outputDir))
-def jargonImport(rootDir):
def jargonImport(rootDir, excludeEntriesFilename, outputDir):
    """Import every file found beneath ``rootDir`` into ``outputDir``.

    Walks the directory tree with os.walk and passes each file path,
    together with ``outputDir``, to jargonReadFile.

    ``excludeEntriesFilename`` is accepted but not used by this function.
    """
    for dirName, _subdirList, fileList in os.walk(rootDir):
        for filename in fileList:
            # os.path.join avoids doubled separators when dirName
            # already ends with one.
            jargonReadFile(os.path.join(dirName, filename), outputDir)
 # Script entry point: run the import only when executed directly,
 # reading from '../original' and writing entries to '../entries'
 # (the exclude-list filename argument is passed as '').
 if __name__ == "__main__":
-    jargonImport('../original')
+    jargonImport('../original','','../entries')