#!/usr/bin/env python

import re
import sys
#sys.path.append('path/to/folder/containing/cmu')
#import cmu
import nltk
from nltk.stem import WordNetLemmatizer

class LingStruc:
    """Thin wrapper exposing linguistic views of a single text string.

    The raw text is kept on ``self.s``; each method derives a structure
    (tokens, paragraphs, sentences, ...) from it on demand.  Several
    methods are placeholders that currently do nothing and return None.
    """

    def __init__(self, s):
        """Remember the text ``s`` that all other methods operate on."""
        self.s = s

    def tokenize(self, stem=False, phones=False):
        """Return ``nltk.word_tokenize`` applied to the stored text.

        ``stem`` and ``phones`` are accepted but not consulted by the
        current implementation.  Swap in your own tokenizer if needed.
        """
        tokens = nltk.word_tokenize(self.s)
        return tokens

    def paragraphs(self):
        """Split the stored text into a list of paragraphs.

        A paragraph boundary is a run of two or more newlines, each of
        which may be followed by arbitrary whitespace.
        """
        boundary = re.compile(r"(?:\n\s*){2,}")
        return boundary.split(self.s)

    def sentences(self):
        """Return the stored text split by NLTK's English punkt model."""
        punkt = nltk.data.load('tokenizers/punkt/english.pickle')
        return punkt.tokenize(self.s)

    def stem(self, word, pos):
        """Lemmatize ``word`` for the given ``pos`` via WordNetLemmatizer.

        Only the lemmatized form is returned.
        """
        # NOTE(review): open question from the original author -- should
        # this return (stemmed, pos) instead of just the stemmed form?
        return WordNetLemmatizer().lemmatize(word, pos)

    def lemmas(self):
        """Return ``nltk.pos_tag`` applied to the tokens of the text.

        Despite the name, no lemmatization is performed here.
        """
        return nltk.pos_tag(self.tokenize())

    def parse(self):
        """Placeholder: not implemented, returns None."""
        return None

    def quotations(self):
        """Placeholder (intended as a regex job): returns None."""
        return None

    def mean_sentence_length(self):
        """Placeholder: not implemented, returns None."""
        return None

    def mean_word_length(self):
        """Placeholder: not implemented, returns None."""
        return None

    def genre(self):
        """Placeholder: not implemented, returns None."""
        return None

    
if __name__ == '__main__':
    # Demo: print every paragraph of a sample text, each preceded by a
    # separator rule.  The path is relative to the current working directory.
    example_filename = 'nltk_data/corpora/gutenberg/austen-emma.txt'

    # Context manager so the file handle is closed as soon as the text
    # has been read, even if reading raises.
    with open(example_filename) as f:
        s = f.read()

    ling = LingStruc(s)

    # Single-argument, parenthesized print behaves the same as a Python 2
    # print statement and is also valid as the Python 3 print function.
    for p in ling.paragraphs():
        print("======================================================================")
        print(p)
