# A script to build the annotation files.

# Build the annotation data package for one supported Affymetrix chip.
#
# Arguments:
#   pkgName  name of the data package to build. Must be one of "hgu95a",
#            "hu6800", "hgu133a", "mgu74a" or "rgu34a"; anything else is
#            rejected before any data source is fetched.
#   pkgPath  directory passed to xml2DataPkg() and makeRdaFile() as the
#            place where the package is written.
#
# Side effects:
#   * attaches AnnBuilder and fetches the source files with fileFetcher()
#   * writes intermediate files under the "temp" directory of the
#     installed AnnBuilder package
#   * creates <pkgName>PMID2AFFY, <pkgName>PATH2AFFY and
#     <pkgName>ENZYME2AFFY in the caller's frame; makeRdaFile() is then
#     pointed at that same frame
#
# Value: whatever the final xml2DataPkg() call returns.
buildDPkg <- function(pkgName,
                      pkgPath = "/misc/homes/jzhang/madman/datapkgs") {

    hsUG <- "ftp://ftp.ncbi.nih.gov/repository/UniGene/Hs.data.gz"
    mmUG <- "ftp://ftp.ncbi.nih.gov/repository/UniGene/Mm.data.gz"
    rnUG <- "ftp://ftp.ncbi.nih.gov/repository/UniGene/Rn.data.gz"

    # Per-chip settings. putIn is the goldenPath release directory; NULL
    # means the chromosomal location step is skipped for that chip.
    chips <- list(
        hgu95a = list(ugURL = hsUG, putIn = "05apr2002",
            organism = "human", umichFile = "hgu95av2_UMich",
            dchipFile = "hgu95av2_Cheng", baseFile = "hgu95av2id"),
        hu6800 = list(ugURL = hsUG, putIn = "05apr2002",
            organism = "human", umichFile = "hu6800_UMich",
            dchipFile = "hu6800_Cheng", baseFile = "hu6800id"),
        hgu133a = list(ugURL = hsUG, putIn = "05apr2002",
            organism = "human", umichFile = "hgu133a_UMich",
            dchipFile = "hgu133a_Affy", baseFile = "hgu133aid"),
        mgu74a = list(ugURL = mmUG, putIn = "mmFeb2002",
            organism = "mouse", umichFile = "mgu74a_UMich",
            dchipFile = "mgu74a_Cheng", baseFile = "mgu74aid"),
        rgu34a = list(ugURL = rnUG, putIn = NULL,
            organism = "rat", umichFile = "rgu34a_UMich",
            dchipFile = "rgu34a_Cheng", baseFile = "rgu34aid"))

    # Fail fast on an unsupported chip instead of hitting an
    # "object not found" error half way through the build.
    supported <- is.character(pkgName) && length(pkgName) == 1 &&
        pkgName %in% names(chips)
    if (!supported) {
        stop(paste("unsupported pkgName '",
                   paste(pkgName, collapse = ", "),
                   "'; expected one of: ",
                   paste(names(chips), collapse = ", "), sep = ""))
    }
    chip <- chips[[pkgName]]
    putIn <- chip[["putIn"]]
    hasChrLoc <- !is.null(putIn)

    # Captured once so every assign()/makeRdaFile() below targets the
    # frame buildDPkg was called from.
    callerEnv <- parent.frame()

    sourceURL <- c("www.ncbi.nlm.nih.gov/UniGene",
        "www.ncbi.nlm.nih.gov/LocusLink", "www.genome.ucsc.edu",
        "dot.ped.med.umich.edu:200/ourimage/microarrays/Affy_annot/UniGene/index.html",
        "www.genome.ad.jp/kegg", "www.biostat.harvard.edu/complab/dchip",
        "www.geneontology.org")

    library(AnnBuilder)
    # .path.package() is defunct in current R; system.file() with only a
    # package argument gives the root directory of that package.
    path <- system.file(package = "AnnBuilder")
    inData <- function(name) file.path(path, "data", name)
    inTemp <- function(name) file.path(path, "temp", name)

    # Define data sources
    ll <- "ftp://ftp.ncbi.nih.gov/refseq/LocusLink/LL_tmpl.gz"
    go <- "ftp://ftp.geneontology.org/pub/go-xml/go_2002-08-termdb.xml.gz"

    # Download data files from sources
    llSource <- fileFetcher(ll)
    ugSource <- fileFetcher(chip[["ugURL"]])
    if (hasChrLoc) {
        refLink <- paste("http://genome.cse.ucsc.edu/goldenPath/",
                         putIn, "/database/refLink.txt.gz", sep = "")
        refGene <- paste("http://genome.cse.ucsc.edu/goldenPath/",
                         putIn, "/database/refGene.txt.gz", sep = "")
        linkSource <- fileFetcher(refLink)
        geneSource <- fileFetcher(refGene)
    }
    goSource <- fileFetcher(go)

    # Get the unified mapping between a given id and the LocusLink id
    llNParser <- paste(llSource, "=", inData("llAffyLLParser"), sep = "")
    ugNParser <- paste(ugSource, "=", inData("ugAffyLLParser"), sep = "")
    tobeMapped <- c(llNParser, ugNParser, inData(chip[["umichFile"]]),
                    inData(chip[["dchipFile"]]))
    testUnified <- inTemp("unifiedMapping")
    acc2LLMapper(outName = testUnified, dbName = "sandBox",
                 tName = "abbuildmap",
                 baseFile = inData(chip[["baseFile"]]),
                 toMap = tobeMapped,
                 baseCols = c("id", "acc"), byID = "id")

    # Parse the LocusLink data file using the unified mapping as base file
    testLL <- inTemp("buildLL")
    fileParser(outName = testLL, baseFile = testUnified,
               dataFile = llSource, parser = inData("llParser"),
               isDir = FALSE)
    llCols <- c("affy", "acc", "locusid", "unigeneid", "name", "symbol",
                "chrom", "cyto", "pmid", "grif", "sumfun", "go")

    # Parse chromosomal location and orientation data
    if (hasChrLoc) {
        testLink <- inTemp("buildLink")
        fileParser(outName = testLink, baseFile = testUnified,
                   dataFile = linkSource,
                   parser = inData("refLinkParser"), isDir = FALSE)
        testGene <- inTemp("buildGene")
        fileParser(outName = testGene, baseFile = testLink,
                   dataFile = geneSource,
                   parser = inData("refGeneParser"), isDir = FALSE)
        ## Some gene location data may end up with no chromosome number
        ## if only chromosome number data from LocusLink are used. Get
        ## them from the refGene file also. The result is only consumed
        ## by the disabled fillCol() step kept further down.
        geneChr <- inTemp("geneChr")
        fileParser(outName = geneChr, baseFile = testLink,
                   dataFile = geneSource,
                   parser = inData("refGeneChrParser"), isDir = FALSE)
        # Merge the LocusLink and refGene results
        testmerged <- inTemp("buildMerged")
        mergeFiles(file1 = testLL, file2 = testGene, file1Col = llCols,
                   file2Col = c("affy", "chrlocation", "chrorientation"),
                   idCol = "affy", outName = testmerged,
                   sep = "\t", header = FALSE, isFile = TRUE)
        # NOTE(review): spelled "chro..." here but "chr..." in the merge
        # above; kept exactly as in the original.
        mergedCols <- c(llCols, "chrolocation", "chroorientation")
    } else {
        testmerged <- testLL
        mergedCols <- llCols
    }

    # Get pathway data and map them to target ids. The temp file is named
    # "humanPath" whatever the organism is.
    pathData <- inTemp("humanPath")
    getPathway(pathData, organism = chip[["organism"]])
    mappedPath <- inTemp("buildPath")
    mapPathway(mappedPath, geneFile = testUnified, pathFile = pathData,
               geneColNames = c("affy", "acc", "ll"),
               pathColNames = c("ll", "path", "enzyme"), mapId = "ll",
               colToKeep = c("affy", "path", "enzyme"),
               geneSep = "\t", pathSep = "\t")

    # Merge pathway data with the previously parsed file (in place)
    mergeFiles(file1 = testmerged, file2 = mappedPath,
               file1Col = mergedCols,
               file2Col = c("affy", "path", "enzyme"),
               idCol = "affy", outName = testmerged,
               sep = "\t", header = FALSE, isFile = TRUE)

    # Produce the XML file (testXML1) containing gene information
    testXML1 <- inTemp(paste(pkgName, ".xml", sep = ""))
    colNames <- c("AFFY", "ACCNUM", "LOCUSID", "UNIGENE", "GENENAME",
                  "SYMBOL", "CHR", "MAP", "PMID", "GRIF", "SUMFUNC", "GO")
    if (hasChrLoc) {
        colNames <- c(colNames, "CHRLOC", "CHRORI")
    }
    colNames <- c(colNames, "PATH", "ENZYME")
    # NOTE(review): CHRLOC/CHRORI stay in multC even when those columns
    # are absent from colNames, as in the original.
    multC <- c("PMID", "GO", "CHRLOC", "CHRORI", "PATH", "ENZYME",
               "CHR", "MAP")
    typeC <- c("GENENAME", "SYMBOL")
    multS <- ";"
    typeS <- ";"
    fileToXML(outName = testXML1, fileName = testmerged,
              fileCol = colNames, name = pkgName, version = "1.0.1",
              multCol = multC, typeCol = typeC, multSep = multS,
              typeSep = typeS, fileSep = "\t", whichOne = "ll",
              header = FALSE, isFile = TRUE)
    xml2DataPkg(fileName = testXML1, pkgName = pkgName, urls = sourceURL,
                path = pkgPath)

    # Get the reverse mapping between PubMed, pathway, enzyme and Affy
    # ids. All three are built before any of them is assigned.
    revKeys <- c("PMID", "PATH", "ENZYME")
    revEnvs <- list()
    for (key in revKeys) {
        revEnvs[key] <- list(file2Env(testmerged, colNames,
                                      keyColName = key,
                                      valueColName = "AFFY"))
    }
    for (key in revKeys) {
        assign(paste(pkgName, key, "2AFFY", sep = ""), revEnvs[[key]],
               envir = callerEnv)
    }

    # Disabled in the original (hgu95a branch only):
    #    if(!is.null(putIn)){
    #        x <- read.table(testmerged, header = FALSE, as.is = TRUE,
    #                        sep = "\t", quote = "", comment.char = "")
    #        y <- read.table(geneChr, header = FALSE, as.is = TRUE,
    #                        sep = "\t", quote = "", comment.char = "")
    #        names(x) <- colNames
    #        names(y) <- c("AFFY", "CHR")
    #        hgu95aCHR <- fillCol(x, y, "AFFY", "CHR", "AFFY", "CHR")
    #        environment(hgu95aCHR) <- parent.frame()
    #        makeRdaFile("CHR", pkgName, pkgPath)
    #    }

    for (key in revKeys) {
        makeRdaFile(paste(key, "2AFFY", sep = ""), pkgName, pkgPath,
                    envir = callerEnv)
    }

    # Process the GO data file
    GOXMLParser(dbName = "sandBox", tName = "abtestgoorig",
                fileName = goSource)
    goNAffy <- inTemp("testgoNaffy")
    fileParser(outName = goNAffy, baseFile = testUnified,
               dataFile = llSource, parser = inData("affyGOParser"),
               isDir = FALSE)
    go2GeneMapper(dbName = "sandBox", tName = "abtestgo",
                  fileName = goNAffy, goTName = "abtestgoorig",
                  exclude = "GO:0003673")
    testXMLByNum <- inTemp(paste(pkgName, "ByNum.xml", sep = ""))
    makeGOByNum(dbName = "sandBox", tName = "abtestgo",
                outName = testXMLByNum,
                tColNames = c("go", "geneid", "genes", "total"),
                fNames = c("GO", "GO2AFFY", "GO2ALLAFFY", "AFFYCOUNTS"),
                multCol = c("GO2AFFY", "GO2ALLAFFY"))
    xml2DataPkg(fileName = testXMLByNum, pkgName = pkgName,
                path = pkgPath, rdaOnly = TRUE)

    #system(paste("cd", file.path(path, "wwwfiles")))
    #system("rm *.*")
    #system(paste("cd", getwd()))
}

# Build the GO data package by handing a fixed description of the GO
# database table to makeGODataPkg().
#
# Arguments:
#   pkgName  name passed to makeGODataPkg() as its second argument
#            (defaults to "GO", the value that used to be hard-coded).
#   pkgPath  directory passed to makeGODataPkg() as `path`.
#
# Reads table "abtestgoorig" in database "sandBox"; presumably this is the
# table filled by the GOXMLParser() call in buildDPkg() -- confirm before
# running buildGO() on its own.
#
# Value: whatever makeGODataPkg() returns.
buildGO <- function(pkgName = "GO",
                    pkgPath = "/misc/homes/jzhang/madman/datapkgs") {
    # The unused `new.env(hash = TRUE, parent = NULL)` local was dropped:
    # a NULL parent environment is an error in current R.
    goSpec <- list(
        GName = c("molecular_function", "biological_process",
                  "cellular_component"),
        dbName = "sandBox",
        tName = "abtestgoorig",
        idColName = "go",
        termColName = "ontology",
        typeColName = "goType",
        PColName = "parentgoid",
        topIds = c("GO:0003674", "GO:0005575", "GO:0008150"),
        version = "1.0.1")
    makeGODataPkg(goSpec, pkgName, path = pkgPath)
}
















