"""
Companion code for the ENGR 108 lectures on Embeddings by Stephen Boyd and Aqib Syed.
This library supports word-embedding analysis, following the style of the VMLS Julia Companion.
"""
module WordEmbeddings

using CSV
using DataFrames
using VMLS

export word_embeddings_data

"""
    word_embeddings_data(; data_dir=<default data directory>)

Return a tuple `(words, emb300, emb2)` with data for the word embeddings example.

- `words` is a vector with the vocabulary words, one per line of `vocab.txt`.
- `emb300` is a vector of `Float32` vectors, one per row of `embeddings_hd_unit.csv`
  (the 300D embedding vectors).
- `emb2` is a vector of `Float32` vectors, one per row of `embeddings_2d.csv`
  (the 2D projection vectors).

With the standard data files each of the three has length 10,000. The lengths are
taken from the files themselves, so data sets of other sizes load as well.

# Keywords
- `data_dir::AbstractString`: directory containing `vocab.txt`,
  `embeddings_hd_unit.csv` and `embeddings_2d.csv`. The default is the location used
  on the original author's machine; pass your own directory instead of editing the
  source.

Errors raised by `readlines` or `CSV.read` (for example when a file is missing)
are propagated unchanged.

Note: If you don't have the data files yet, run: julia src/prepare_embeddings.jl
This will download GloVe data and create the required files!
"""
function word_embeddings_data(;
    data_dir::AbstractString="/Users/aqibsyed/Documents/Fall 2025/ENGR 108/Embeddings/embeddings_demo/data",
)
    words = readlines(joinpath(data_dir, "vocab.txt"))
    emb300 = _read_embedding_rows(joinpath(data_dir, "embeddings_hd_unit.csv"))
    emb2 = _read_embedding_rows(joinpath(data_dir, "embeddings_2d.csv"))

    return words, emb300, emb2
end

# Read a headerless CSV file and return its rows as a vector of `Float32` vectors.
function _read_embedding_rows(path::AbstractString)
    matrix = Float32.(Matrix(CSV.read(path, DataFrame; header=false)))
    # Iterate over the rows actually present rather than a fixed count, so a file
    # with fewer rows does not throw a BoundsError and one with more rows is not
    # silently truncated. The typed comprehension keeps the element type concrete
    # even though the matrix type is only known at run time.
    return Vector{Float32}[matrix[i, :] for i in axes(matrix, 1)]
end

end