#!/usr/bin/env python3

"""
Stanford CS106A WordCount Example
Nick Parlante

Counting the words in a text file is a sort
of Rosetta Stone of programming, demonstrating the use of files, dicts,
functions, loops, logic, decomposition, testing, and main().

Code is provided for alphabetical output like:
$ python3 wordcount.py somefile.txt
aardvark 1
anvil 3
boat 4
...

**Exercise**

Implement code in print_top() to print the n most common words,
using sorted/lambda/items.

Then the command line -top n feature calls print_top() for output like:
$ python3 wordcount.py -top 10 alice-book.txt
the 1639
and 866
to 725
a 631
she 541
it 530
of 511
said 462
i 410
alice 386
"""

import sys


def clean(s):
    """
    Strips non-alpha chars off both ends of string s and returns
    what remains, so '@@hi^^' yields 'hi'. Non-alpha chars in the
    interior of the word are kept. Returns the empty string when s
    contains no alpha chars at all.
    (provided code)
    >>> clean('$abc^')
    'abc'
    >>> clean('Abc$-$')
    'Abc'
    >>> clean('^x.')
    'x'
    >>> clean('ab-cd.')
    'ab-cd'
    >>> clean('$$$')
    ''
    >>> clean('')
    ''
    """
    # Scan from the left for the index of the first alpha char
    first = next((i for i, ch in enumerate(s) if ch.isalpha()), None)

    # No alpha char anywhere -> nothing survives the cleaning
    if first is None:
        return ''

    # Scan from the right for the last alpha char. The scan stops at
    # first at the latest, and s[first] is alpha, so a match always exists.
    last = next(i for i in range(len(s) - 1, first - 1, -1) if s[i].isalpha())

    return s[first:last + 1]


def read_counts(filename):
    """
    Reads the text of the file named filename and tallies its words.
    Each whitespace-separated word is lowercased and passed through
    clean(); words that come out empty are skipped.
    Returns a "counts" dict mapping each remaining word to the int
    number of times it appears in the text.
    (provided code)
    >>> read_counts('test1.txt')
    {'a': 2, 'b': 2}
    >>> read_counts('test2.txt')
    {'b': 1, 'a': 2}
    >>> read_counts('test3.txt')
    {'bob': 1}
    """
    tally = {}
    # Standard file pattern: open the file, then handle it line by line
    with open(filename) as infile:
        for raw_line in infile:
            # split() with no params splits on runs of whitespace
            for token in raw_line.strip().split():
                cleaned = clean(token.lower())
                # tricky - a token of pure punctuation cleans down to ''
                if cleaned == '':
                    continue
                tally[cleaned] = tally.get(cleaned, 0) + 1
    return tally
    # Note: word counting does not depend on the line boundaries, so
    # reading the whole file with f.read() and splitting that one big
    # string would work as an alternative to the per-line loop.


def print_counts(counts):
    """
    Prints every word in the counts dict together with its count,
    one "word count" pair per line, in alphabetical order of the words:
    aardvark 1
    apple 13
    ...
    (provided code)
    """
    # items() yields each key/value pair in one step; order the pairs
    # by their word alone.
    pairs = sorted(counts.items(), key=lambda pair: pair[0])
    for key, value in pairs:
        print(key, value)
    # An equivalent form loops over sorted(counts.keys()) and looks up
    # counts[word] for each word.


def print_top(counts, n):
    """
    Given counts dict and int n, print the n most common words
    in decreasing order of count, one "word count" pair per line:
    the 1639
    and 866
    to 725
    ...
    Words with equal counts come out in the order they have in the
    counts dict, since sorted() is stable. If n is larger than the
    number of words, every word is printed. If n is zero or negative,
    nothing is printed.
    """
    # 1. Sort the (word, count) pairs so the largest count comes first
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    # 2. Loop over a slice of the first n pairs. Clamp n at 0 first: a
    # negative slice bound counts from the end of the list, which would
    # print almost every word instead of none.
    for word, count in ranked[:max(n, 0)]:
        print(word, count)


def main():
    """
    Command line entry point. (provided)
    Supported command line forms:
    1. filename            prints every word and its count, alphabetically
    2. -top n filename     prints the n most common words
    Any other argument list, or an n that is not an integer, prints a
    usage message to standard error and exits with status 1.
    """
    usage = 'usage: python3 wordcount.py [-top n] filename'
    args = sys.argv[1:]

    if len(args) == 1:
        # filename
        counts = read_counts(args[0])
        print_counts(counts)
    elif len(args) == 3 and args[0] == '-top':
        # -top n filename
        # Parse n before reading the file, so a bad n is reported
        # without doing any file work first.
        try:
            n = int(args[1])
        except ValueError:
            print('error: n must be an integer, got ' + repr(args[1]),
                  file=sys.stderr)
            print(usage, file=sys.stderr)
            sys.exit(1)
        counts = read_counts(args[2])
        print_top(counts, n)
    else:
        # Unrecognized arguments: say so rather than silently doing nothing
        print(usage, file=sys.stderr)
        sys.exit(1)


# Run main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
