Linguist 278: Programming for Linguists
Stanford Linguistics, Fall 2020
Christopher Potts

Assignment 6: Language dataset hackathon

Distributed 2020-11-09
Due 2020-11-16

Contents

  1. Overview
    1. Requirements
    2. Ideas
  2. Set-up
  3. Age of acquisition dataset
  4. Concreteness dataset
  5. Sentiment dataset
  6. Beautiful words
  7. Novels from Project Gutenberg
  8. Potentially useful code
    1. Project Gutenberg iterator
    2. Sentence tokenizing using NLTK
    3. Word counts
    4. egrep

Overview

Requirements

Ideas

Examples of things you might do (not meant to be restrictive!):

Set-up

Download the hackathon data distribution:

http://web.stanford.edu/class/linguist278/data/hackathon.zip

and unzip it in the same directory as this notebook. (If you want to put it somewhere else, just be sure to change data_home in the next cell.)
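A minimal sketch of the set-up cell; the filename passed to os.path.join is just an illustration, not an actual file in the distribution:

```python
import os

# Directory containing the unzipped hackathon files; edit this if you
# unzipped the archive somewhere else.
data_home = "hackathon"

# Build paths to files inside the distribution like this:
example_path = os.path.join(data_home, "example.txt")  # hypothetical filename
```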

Age of acquisition dataset

From Age-of-acquisition ratings for 30 thousand English words (Victor Kuperman, Hans Stadthagen-Gonzalez, and Marc Brysbaert, Behavior Research Methods, 2014):

  1. Word: The word (str)
  2. OccurTotal: token count in their data
  3. OccurNum: The number of participants who gave an age-of-acquisition rating rather than answering "Unknown"
  4. Rating.Mean: mean age of acquisition, in years
  5. Rating.SD: standard deviation of the distribution of ages of acquisition
  6. Frequency: token count of Word in the SUBTLEX-US corpus
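A sketch of working with these columns in pandas. The tiny in-memory DataFrame below stands in for the real file (whose exact filename in the distribution isn't repeated here); the column names follow the overview above:

```python
import pandas as pd

# Toy stand-in for the age-of-acquisition ratings file:
aoa = pd.DataFrame({
    "Word": ["dog", "ephemeral"],
    "Rating.Mean": [2.5, 12.3],
    "Rating.SD": [0.8, 3.1],
})

# One natural first step: which words are learned earliest?
earliest = aoa.sort_values("Rating.Mean").head()
```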

Concreteness dataset

We've worked with this dataset before. It's presented in Concreteness ratings for 40 thousand generally known English word lemmas (Marc Brysbaert, Amy Beth Warriner, and Victor Kuperman, Behavior Research Methods, 2014). Overview:

  1. Word: The word (str)
  2. Bigram: Whether it is a single word or a two-word expression
  3. Conc.M: The mean concreteness rating
  4. Conc.SD: The standard deviation of the concreteness ratings (float)
  5. Unknown: The number of persons indicating they did not know the word
  6. Total: The total number of persons who rated the word
  7. Percent_known: Percentage of participants who knew the word
  8. SUBTLEX: The SUBTLEX-US frequency count
  9. Dom_Pos: The part-of-speech where known
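A small sketch of filtering on these columns; the toy DataFrame is a stand-in for the real file, with column names taken from the overview above:

```python
import pandas as pd

# Toy stand-in for the concreteness ratings:
conc = pd.DataFrame({
    "Word": ["apple", "justice"],
    "Bigram": [0, 0],
    "Conc.M": [4.9, 1.5],
    "Percent_known": [1.0, 0.85],
})

# Restrict to single words that nearly all participants knew:
known = conc[(conc["Bigram"] == 0) & (conc["Percent_known"] >= 0.95)]
```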

Sentiment dataset

The dataset Norms of valence, arousal, and dominance for 13,915 English lemmas (Amy Beth Warriner, Victor Kuperman, and Marc Brysbaert, Behavior Research Methods, 2013) contains rich sentiment information for more than 13K words. The following code reads in the full dataset and then restricts to just the mean ratings for the three core semantic dimensions:

  1. Word: The word (str)
  2. Valence (positive/negative)
  3. Arousal (intensity)
  4. Dominance
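A sketch of the column restriction described above. The column names ("V.Mean.Sum", etc.) follow the conventions of the published norms but should be checked against the file in the distribution; the toy DataFrame stands in for the real data:

```python
import pandas as pd

# Toy stand-in for the full valence/arousal/dominance norms:
ratings = pd.DataFrame({
    "Word": ["happy", "angry"],
    "V.Mean.Sum": [8.21, 2.53],
    "V.SD.Sum": [0.92, 1.64],
    "A.Mean.Sum": [6.05, 6.26],
    "D.Mean.Sum": [7.22, 4.11],
})

# Restrict to just the mean ratings for the three core dimensions:
means = ratings[["Word", "V.Mean.Sum", "A.Mean.Sum", "D.Mean.Sum"]]
```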

Beautiful words

I took the 100 Most Beautiful Words (of which there are 107) and enriched them:

  1. Word: The word (str).
  2. Pronunciation: CMU Pronouncing Dictionary representation.
  3. Morphology: Celex morphological representations.
  4. Frequency: frequency according to the Google N-gram Corpus.
  5. Category: 'most-beautiful' or 'regular'

The 'regular' examples are 107 randomly selected non-proper-names.

Maybe there's something interesting here?
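For instance, one could compare frequencies across the two categories. A sketch on toy data (the words and frequencies here are made up for illustration):

```python
import pandas as pd

# Toy stand-in for the enriched beautiful-words dataset:
words = pd.DataFrame({
    "Word": ["cynosure", "table", "ephemeral", "run"],
    "Frequency": [120, 900000, 4300, 2000000],
    "Category": ["most-beautiful", "regular", "most-beautiful", "regular"],
})

# Are "beautiful" words rarer than regular ones?
mean_freq = words.groupby("Category")["Frequency"].mean()
```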

Novels from Project Gutenberg

The Gutenberg metadata has been removed from these files, and the first line gives the title, author, and publication year in a systematic pattern.
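A sketch of parsing that first line with a regular expression. The exact pattern used in the distributed files may differ; this assumes a line shaped like "Emma, by Jane Austen (1815)", so check a file and adjust the regex accordingly:

```python
import re

# Hypothetical first-line format; verify against the actual files.
first_line = "Emma, by Jane Austen (1815)"
match = re.match(r"(.+), by (.+) \((\d{4})\)", first_line)
title, author, year = match.groups()
```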

Potentially useful code

Project Gutenberg iterator

You might want to modify this, depending on how you want to process these texts (by word? sentence? chapter?).
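A minimal line-level version of such an iterator, assuming the metadata occupies only the first line; the demo file created below stands in for one of the distributed novels:

```python
import os

def gutenberg_lines(path):
    """Yield the stripped, non-empty body lines of one novel, skipping
    the first line (which holds the title/author/year metadata)."""
    with open(path, encoding="utf8") as f:
        next(f)  # skip the metadata line
        for line in f:
            line = line.strip()
            if line:
                yield line

# Tiny demo on a file we create here:
with open("demo_novel.txt", "w", encoding="utf8") as f:
    f.write("Emma, by Jane Austen (1815)\n\nIt was a truth.\n")

body = list(gutenberg_lines("demo_novel.txt"))
```

To iterate by sentence or chapter instead, replace the per-line loop with a pass over the whole text.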

Sentence tokenizing using NLTK

Word counts

From assignment 2.
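A sketch in the spirit of that assignment, using collections.Counter with lowercase whitespace tokenization and a simple punctuation strip (the original assignment's tokenization may differ):

```python
from collections import Counter

def word_counts(text):
    """Count words after lowercasing, splitting on whitespace, and
    stripping common edge punctuation."""
    words = [w.strip('.,;:!?"\'').lower() for w in text.split()]
    return Counter(w for w in words if w)

counts = word_counts("The cat sat. The cat ran!")
```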

egrep

From assignment 5.
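A sketch of an egrep-style helper built on Python's re module, returning the lines that match an extended regular expression (the assignment 5 version may have a different signature):

```python
import re

def egrep(pattern, lines):
    """Return the lines in which the regular expression matches anywhere."""
    regex = re.compile(pattern)
    return [line for line in lines if regex.search(line)]

hits = egrep(r"colou?r", ["colour me", "red", "color wheel"])
```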