# JargonFile/import/importjargon.py

import os
import HTMLParser, urllib, urlparse

class JargonFile(dict):
    """A ``dict`` subclass that traces item access on standard output.

    Reading an item with ``[]``, writing an item with ``[]`` and calling
    ``update`` each print a short trace line. Entries passed to the
    constructor are handed straight to ``dict.__init__``.
    """

    def __init__(self, *arg, **kw):
        """Initialise the mapping with the same arguments ``dict`` accepts."""
        # The original passed an undefined name (``CustomDictOne``) to
        # ``super``, so creating an instance always raised NameError.
        super(JargonFile, self).__init__(*arg, **kw)

    def __getitem__(self, key):
        """Return the value stored under *key* and print a ``GET`` trace.

        Raises:
            KeyError: if *key* is absent; nothing is printed in that case
                because the lookup happens before the trace line.
        """
        val = dict.__getitem__(self, key)
        # Single pre-formatted argument: prints identically whether ``print``
        # is the Python 2 statement or the Python 3 function.
        print('GET %s' % (key,))
        return val

    def __setitem__(self, key, val):
        """Print a ``SET`` trace, then store *val* under *key*."""
        print('SET %s %s' % (key, val))
        dict.__setitem__(self, key, val)

    def __repr__(self):
        """Return ``ClassName({...})`` using the plain ``dict`` repr inside."""
        return '%s(%s)' % (type(self).__name__, dict.__repr__(self))

    def update(self, *args, **kwargs):
        """Print an ``update`` trace and store every supplied pair.

        Accepts the same arguments as ``dict.update``. Each pair is assigned
        through ``self[k] = v`` so it is also reported by ``__setitem__``.
        """
        print('update %s %s' % (args, kwargs))
        # ``items()`` exists on both Python 2 and 3, unlike ``iteritems()``.
        for k, v in dict(*args, **kwargs).items():
            self[k] = v

class JargonParser(HTMLParser.HTMLParser):
    """HTML parser that records text from the head and body of a page.

    After ``feed()``:

    * ``title`` holds the most recent text chunk seen while in the
      ``<head>`` section.
    * ``bodyText`` holds the text chunks seen while in the ``<body>``
      section, each whitespace-reduced, stripped and followed by a single
      space.
    * ``currentSection`` is ``'head'``, ``'body'`` or ``''`` (neither tag
      seen yet).
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Not read or written anywhere else in this class; kept so that any
        # external code relying on the attribute keeps working.
        self.seen = {}
        self.currentSection = ''
        self.title = ''
        self.bodyText = ''

    def handle_data(self, data):
        """Route a chunk of text to ``title`` or ``bodyText``.

        Text that arrives before any ``<head>``/``<body>`` tag is ignored.
        """
        if self.currentSection == 'head':
            # store the title; each head text chunk also resets the body
            self.title = data
            self.bodyText = ''
        elif self.currentSection == 'body':
            # Longest runs of spaces first, then individual control
            # characters, each replaced by one space.
            for rep in ('    ', '   ', '  ', '\t', '\r', '\n'):
                data = data.replace(rep, ' ')
            self.bodyText = self.bodyText + data.strip() + ' '

    def handle_starttag(self, tag, attributes):
        """Track whether parsing is currently inside ``<head>`` or ``<body>``.

        The tag is matched exactly. The original substring test
        (``"head" in tag``) also fired for ``<thead>`` and ``<header>``,
        which switched the parser back to head mode in the middle of the
        body, overwriting the title and discarding the collected body text.
        """
        if tag in ('head', 'body'):
            self.currentSection = tag

def jargonSaneText(text):
    """Reduce raw page text to the definition that follows the headword.

    Returns ``''`` when *text* is shorter than two characters or contains no
    ``' : '`` separator. Otherwise:

    1. everything before the first ``' : '`` is dropped and the remaining
       pieces are glued together without the separator;
    2. if a full stop is present, anything after the last one is cut off;
    3. ``' . '`` and ``' .'`` become ``'. '``, one pass of double-space
       replacement is applied, and the result is stripped.
    """
    if len(text) < 2:
        return ''

    pieces = text.split(' : ')
    if len(pieces) < 2:
        return ''

    # Discard the leading piece (the headword) and concatenate the rest.
    cleaned = ''.join(pieces[1:])

    # Keep text up to and including the final full stop, if there is one.
    last_stop = cleaned.rfind('.')
    if last_stop != -1:
        cleaned = cleaned[:last_stop + 1]

    # Order matters: the three-character pattern is handled before the
    # two-character one, and spaces are squeezed last.
    for old, new in ((' . ', '. '), (' .', '. '), ('  ', ' ')):
        cleaned = cleaned.replace(old, new)

    return cleaned.strip()


def jargonReadFile(filename):
    """Parse one HTML file and print its title and cleaned-up body text.

    The file is fed to a ``JargonParser``. If the resulting title is longer
    than one character and some body text was collected, the body text is
    passed through ``jargonSaneText`` and both values are printed.
    Otherwise nothing is printed.

    Args:
        filename: path of the file to read.
    """
    # The context manager closes the handle; the original left one open for
    # every file visited. Reading in one call also avoids rebuilding the
    # buffer string once per line.
    with open(filename) as inFile:
        contents = inFile.read()

    parser = JargonParser()
    parser.feed(contents)

    # Compare by value. The original used ``is not ''``, an identity test
    # against a literal whose result depends on interpreter string interning.
    # A title longer than one character is necessarily non-empty.
    if len(parser.title) > 1 and parser.bodyText != '':
        parser.bodyText = jargonSaneText(parser.bodyText)
        # Single-argument calls print the same under Python 2 and 3.
        print("Title: " + parser.title)
        print("Text: " + parser.bodyText + "\n")

def jargonImport(rootDir):
    """Run ``jargonReadFile`` on every file found beneath *rootDir*.

    The tree is traversed with ``os.walk``, so files in all nested
    directories are visited.

    Args:
        rootDir: directory at which the traversal starts.
    """
    for dirName, _subdirList, fileList in os.walk(rootDir):
        for filename in fileList:
            # os.path.join uses the platform's separator instead of a
            # hard-coded '/', and does not double it when dirName already
            # ends with one.
            jargonReadFile(os.path.join(dirName, filename))

# Script entry point: when run directly, import everything found under
# '../original' (a relative path, so it depends on the working directory).
if __name__ == '__main__':
    jargonImport(rootDir='../original')