Linguist 278: Programming for Linguists
Stanford Linguistics, Fall 2021
Christopher Potts
import csv
import glob
import os

import pandas as pd
Download this package of example files, unzip it, and place the resulting folder in this directory:
https://web.stanford.edu/class/linguist278/data/sampledocs.zip
# Folder holding the unzipped sample files, and the path of the example CSV inside it.
sampledoc_dirname = "sampledocs"
csv_example = os.path.join(sampledoc_dirname, "movie-data.csv")
For most CSV work, you can use pd.read_csv and not worry about the details:
# Load the example CSV into a DataFrame.
movie_df = pd.read_csv(csv_example)
# Bare expression: a notebook displays the value; a plain script shows nothing.
movie_df
It also works with compressed files:
# Same pd.read_csv call, this time pointed at the .csv.gz path.
csv_gz_example = os.path.join(sampledoc_dirname, "movie-data.csv.gz")
movie_df = pd.read_csv(csv_gz_example)
And the `delimiter` argument will let you handle TSV and other formats, as with the built-in `csv` library.
To write a `pd.DataFrame` `df` to CSV with pandas, use `df.to_csv`:
# NOTE: both calls write to the same paths that were read above.
movie_df.to_csv(csv_example, index=None)
movie_df.to_csv(csv_gz_example, compression="gzip", index=None)
For Excel files, use pd.read_excel for reading:
# Path of the example Excel workbook, then load it into a DataFrame.
xlsx_example = os.path.join(sampledoc_dirname, "movie-data.xlsx")
movie_df = pd.read_excel(xlsx_example)
And df.to_excel for writing:
# NOTE: writes to the same .xlsx path that was read above.
movie_df.to_excel(xlsx_example, index=None)
def csv_reader(src_filename, delimiter=","):
    """Iterate over the rows of a delimited text file.

    Args:
        src_filename: Path to the file to read.
        delimiter: Field separator, e.g. "," for CSV or "\\t" for TSV.

    Yields:
        One list of str per row.
    """
    # This is a generator so that the file stays open while rows are being
    # consumed; returning the reader from inside the `with` block would hand
    # the caller a reader over an already-closed file.
    # newline="" lets the csv module do its own newline handling.
    with open(src_filename, newline="") as f:
        yield from csv.reader(f, delimiter=delimiter)
def csv_reader_dicts(src_filename, delimiter=","):
    """Iterate over the rows of a delimited text file as dicts.

    The first row of the file supplies the keys.

    Args:
        src_filename: Path to the file to read.
        delimiter: Field separator, e.g. "," for CSV or "\\t" for TSV.

    Yields:
        One dict per data row, mapping column name to str value.
    """
    # This is a generator so that the file stays open while rows are being
    # consumed; returning the DictReader from inside the `with` block would
    # hand the caller a reader over an already-closed file.
    # newline="" lets the csv module do its own newline handling.
    with open(src_filename, newline="") as f:
        yield from csv.DictReader(f, delimiter=delimiter)
def csv_writer(rows, output_filename, header=None):
    """Write rows to a CSV file, optionally preceded by a header row.

    Args:
        rows: Iterable of rows, each an iterable of field values.
        output_filename: Path of the file to create or overwrite.
        header: Optional sequence of column names written as the first row.
    """
    # newline="" stops the text layer from translating the line endings the
    # csv module writes, which otherwise yields blank rows on Windows.
    with open(output_filename, "wt", newline="") as f:
        writer = csv.writer(f)
        if header is not None:
            writer.writerow(header)
        writer.writerows(rows)
Installing textract: https://textract.readthedocs.io/en/stable/installation.html
import textract
# Path of the example Word document inside the sample folder.
docx_example = os.path.join(sampledoc_dirname, "ling278_stanfordtools.docx")
def docx_reader(src_filename):
    """Extract the text of a document with textract and return it as a str.

    Args:
        src_filename: Path to the document to read.

    Returns:
        The output of `textract.process`, decoded to str.
    """
    # Read the file the caller asked for, not the module-level example path.
    return textract.process(src_filename).decode()
# Read the example document and show only its first 200 characters.
docx_text = docx_reader(docx_example)
print(docx_text[: 200])
textract will also do its best with a wide variety of other file formats!
import gzip
def gzip_reader(src_filename):
    """Lazily yield the lines of a gzip-compressed UTF-8 text file.

    Each yielded str keeps its trailing newline, if it has one.
    """
    with gzip.open(src_filename, mode='rt', encoding='utf8') as stream:
        yield from stream
def gzip_writer(s, output_filename):
    """Write a str to a gzip-compressed file, encoded as UTF-8.

    Args:
        s: The text to write.
        output_filename: Path of the .gz file to create or overwrite.
    """
    # Open the path given by the caller; the previous body referred to an
    # undefined name here and raised NameError.
    with gzip.open(output_filename, mode='wb') as f:
        f.write(s.encode(encoding="utf8"))
# gzip_reader is a generator; list() consumes it so all lines are collected.
list(gzip_reader(csv_gz_example))
See my notebook on this topic:
http://web.stanford.edu/class/linguist278/notes/ling278_scraping_solved.ipynb [HTML version]
The key libraries are requests and Beautiful Soup.
JSON is a common data format these days. It is more flexible than CSV because it allows nesting of objects and built-in typing of objects. It can handle str, int, float, bool, None (stored as null), list, and dict.
The limitations of JSON are what make it very portable: it stores things in plain-text and uses objects that all modern programming languages have.
For other Python objects, you need to resort to pickle, which is not portable outside of Python.
import json
# Path of the toy JSON file, and a small nested value to write to it.
json_example = os.path.join(sampledoc_dirname, "toy-json.json")
# A list mixing dicts and a bare boolean, with a nested list and a nested dict.
j = [
{"a": 1, "b": 2.45, "c": [1,2,3], "d": {"dd": True}},
{"f": 7},
True
]
def json_writer(d, output_filename):
    """Serialize `d` as JSON to `output_filename`.

    The output is indented by four spaces and dict keys are sorted.
    """
    with open(output_filename, mode="wt") as sink:
        json.dump(d, sink, sort_keys=True, indent=4)
# Write the toy value defined above to the toy JSON path.
json_writer(j, json_example)
def json_reader(src_filename):
    """Parse the JSON file at `src_filename` and return the resulting object."""
    with open(src_filename, mode="rt") as source:
        parsed = json.load(source)
    return parsed
# Read the file back, rebinding `j` to the parsed result.
j = json_reader(json_example)
def read_jsonl(src_filename):
    """Read a JSON Lines file: one JSON value per line.

    Args:
        src_filename: Path to the .jsonl file.

    Returns:
        A list of the parsed values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(src_filename) as f:
        # Skip whitespace-only lines (e.g. a trailing empty line) instead of
        # failing on them; they carry no record.
        return [json.loads(line) for line in f if line.strip()]
def write_jsonl(data, output_filename):
    """Write an iterable of JSON-serializable values as JSON Lines.

    Args:
        data: Iterable of values; each one becomes one line of the output.
        output_filename: Path of the file to create or overwrite.
    """
    # Serialize everything before opening the file, so that a value that
    # cannot be serialized does not leave a truncated file behind.
    # Collecting lines in a list avoids repeated string concatenation.
    lines = [json.dumps(d) + "\n" for d in data]
    with open(output_filename, "wt") as f:
        f.writelines(lines)
# Path of the example JSON Lines file; read it and inspect the first record.
jsonl_example = os.path.join(sampledoc_dirname, "movie-data.jsonl")
movies = read_jsonl(jsonl_example)
movies[0]
# NOTE: writes back to the same path that was just read.
write_jsonl(movies, jsonl_example)
To write JSONL from a `pd.DataFrame` `df`: `df.to_json(output_filename, orient="records", lines=True)`
!pip install pymupdf
import fitz
from fitz.utils import getColor
def pdf2text(src_filename):
    """Open a PDF file and extract its page contents, returning a list of str.

    Args:
        src_filename: Path to the PDF file.

    Returns:
        A list with one plain-text string per page, in page order.
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(src_filename) as doc:
        # `get_text` is the current name of the deprecated `getText` alias.
        return [page.get_text('text') for page in doc]
# Path of the example PDF; extract its pages and show the first 200
# characters of the first page.
pdf_example = os.path.join(sampledoc_dirname, "ling278_stanfordtools.pdf")
pdf_text = pdf2text(pdf_example)
print(pdf_text[0][: 200])
def pdf_highlighter(span, src_filename, output_filename, color="tomato"):
    """Highlight every occurrence of `span` in a PDF and save a new file.

    Args:
        span: The text to search for on each page.
        src_filename: Path to the PDF to read.
        output_filename: Path of the annotated PDF to write.
        color: Color name passed to `fitz.utils.getColor` for the highlight.
    """
    rgb = fitz.utils.getColor(color)
    # The context manager closes the document once it has been saved, or if
    # anything fails along the way.
    with fitz.open(src_filename) as doc:
        for page in doc:
            # snake_case method names replace the deprecated camelCase
            # aliases (searchFor, addHighlightAnnot, setColors, setInfo).
            for inst in page.search_for(span):
                ann = page.add_highlight_annot(inst)
                ann.set_colors({"stroke": rgb})
                info = ann.info
                info["title"] = "Interesting! Tell me more!"
                ann.set_info(info)
                # Apply the color and info changes to the annotation.
                ann.update()
        doc.save(output_filename, garbage=4, deflate=True, clean=True, expand=0)
# Write the highlighted copy to a separate file next to the source PDF.
pdf_output_filename = os.path.join(sampledoc_dirname, "ling278_stanfordtools-highlighting.pdf")
pdf_highlighter("Stanford", pdf_example, pdf_output_filename)
For scanned PDFs and image files that don't have embedded text, you have to do more advanced pre-processing (optical character recognition, OCR) to identify the text. I recommend the open-source OCR engine Tesseract for this.
The pickle library has an interface that is very similar to JSON. However, whereas JSON is a plain-text format that is highly limited in what it can store, you can pickle just about any Python data structure. This makes it really useful for quickly storing large data structures. You can also pickle functions and classes, but note that these are stored by reference to their name rather than as code, so the same definition must be available wherever you load the pickle. The main downside is that pickle is not a portable format. It can be read only by Python, and there can even be problems reading and writing pickle files across Python versions. So you might think of pickle as your own private, temporary storage format.
import pickle
# Path of the toy pickle file inside the sample folder.
pickle_example = os.path.join(sampledoc_dirname, "toy-pickle.pickle")
# Imagine this is a huge dictionary that took your computer
# all night to build, and you just want to stash it so that you
# don't have to keep rebuilding it.
d = {"a": 1, "b": 2.45, "c": [1, 2, 3], "d": {"dd": True}}
def write_pickle(pyobj, output_filename):
    """Pickle `pyobj` into the binary file at `output_filename`."""
    with open(output_filename, mode="wb") as sink:
        pickle.dump(pyobj, sink)
# Stash the dictionary defined above at the toy pickle path.
write_pickle(d, pickle_example)
def read_pickle(src_filename):
    """Load and return the object pickled in the file at `src_filename`."""
    with open(src_filename, mode="rb") as source:
        restored = pickle.load(source)
    return restored
# Load the dictionary back, rebinding `d` to the unpickled object.
d = read_pickle(pickle_example)
def exponent(x, pow=2):
    """Return `x` raised to the power `pow` (squared by default)."""
    result = x ** pow
    return result
# Pickle the function object itself, load it back under a new name, and call it.
pickled_exponent = os.path.join(sampledoc_dirname, "exponent.pickle")
write_pickle(exponent, pickled_exponent)
exponent2 = read_pickle(pickled_exponent)
exponent2(4) # Returns 16.
import zipfile
zip_example = os.path.join(sampledoc_dirname, "gutenberg.zip")
def open_zipfile(src_filename, output_dirname, file_to_open=None):
    """Extract a ZIP archive into `output_dirname`.

    With `file_to_open` given, only that member is extracted; otherwise
    every member of the archive is extracted.
    """
    with zipfile.ZipFile(src_filename) as archive:
        if file_to_open is not None:
            archive.extract(file_to_open, path=output_dirname)
            return
        archive.extractall(path=output_dirname)
# Extract a single member. Names inside a ZIP archive always use forward
# slashes, so the member name is written literally rather than built with
# os.path.join, which would produce a backslash on Windows and fail the lookup.
open_zipfile(
    zip_example,
    sampledoc_dirname,
    file_to_open="gutenberg/austen-emma.txt")

# Extract every member of the archive.
open_zipfile(
    zip_example,
    sampledoc_dirname)
def write_zipfile(src_filenames, output_filename):
    """Create a ZIP archive at `output_filename` from the given files.

    Each file is stored under its base name only, so the directory part of
    the source path is dropped inside the archive.
    """
    with zipfile.ZipFile(output_filename, "w") as archive:
        for path in src_filenames:
            member_name = os.path.basename(path)
            archive.write(path, member_name)
# Collect every sample file whose name starts with "movie-data." and bundle
# them into one archive, written relative to the current working directory.
movie_filenames = glob.glob(os.path.join(sampledoc_dirname, "movie-data.*"))
write_zipfile(movie_filenames, "movie-data.zip")