DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

Snippets has posted 5883 posts at DZone. View Full User Profile

Dsv.fs

05.02.2011
| 1978 views |
  • submit to reddit
        // library for working with delimiter-separated values files

module Dsv
open System
open System.IO

// An F# mutually-tail-recursive CSV record parser.
// See the spec at http://creativyst.com/Doc/Articles/CSV/CSV01.htm
let rec csvrecord sep (tr:#TextReader) record (line:string) i =
    if i = line.Length then record @ [""]
    else
        match line.[i] with
        | '"' -> csvfield sep tr record "" line (i+1)
        | ' ' | '\t' -> csvrecord sep tr record line (i+1)
        |  c  when c = sep -> csvrecord sep tr (record @ [""]) line (i+1)
        | '=' when line.[i+1] = '"' -> csvfield sep tr record "" line (i+2) // Excel compatibility
        | _ -> // unquoted field data
            let fs = line.IndexOf(sep,i)
            if fs = -1 then record @ [line.Substring(i).TrimEnd()]
            else
                csvrecord sep tr (record @ [line.Substring(i,fs-i).TrimEnd()]) line (fs+1)
and csvfield sep (tr:#TextReader) record field (line:string) i =
    if i = line.Length then csvfield sep tr record (field+"\n") (tr.ReadLine()) 0
    elif line.[i] <> '"' then
        let q = line.IndexOf('"',i)
        if q = -1 then csvfield sep tr record (field+line.Substring(i)+"\n") (tr.ReadLine()) 0
        else csvfield sep tr record (field+line.Substring(i,q-i)) line q
    elif i = line.Length-1 then record @ [field]
    elif line.[i+1] = '"' then csvfield sep tr record (field+"\"") line (i+2)
    elif line.[i+1] = sep then csvrecord sep tr (record @ [field]) line (i+2)
    else // not an escaped quote and not end of field; try to recover by appending trimmed unquoted field data
        let fs = line.IndexOf(sep,i+1)
        if fs = -1 then record @ [field+line.Substring(i).TrimEnd()]
        else csvrecord sep tr (record @ [field+line.Substring(i,fs-i).TrimEnd()]) line (fs+1)

let csvrows sep (filepath:string) = seq {
        use sr = new StreamReader(filepath)
        while not sr.EndOfStream do
            yield csvrecord sep sr [] (sr.ReadLine()) 0
    }

let csvrecords sep (filepath:string) = seq {
        let lists = csvrows sep filepath
        let headers = Seq.head lists
        for vals in (Seq.skip 1 lists) do
            yield List.zip headers vals |> Map.ofList
    }

let tsvrows (sep:char) (filepath:string) = seq {
        use sr = new StreamReader(filepath)
        while not sr.EndOfStream do
            yield sr.ReadLine().Split(sep) |> List.ofArray
    }

let rows (filepath:string) =
    let filename = Path.GetFileName(filepath)
    match Path.GetExtension(filename).ToLower() with 
    | ".csv" -> csvrows ',' filepath
    | ".tsv" | ".tab" -> tsvrows '\t' filepath
    | ".skv" -> csvrows ';' filepath
    | ".psv" -> tsvrows '|' filepath
    | ".log" when filename.StartsWith("ex") || filename.StartsWith("u_ex") -> tsvrows ' ' filepath
    | _ -> 
        use sr = new StreamReader(filepath)
        let line = sr.ReadLine()
        let sep = ['\t';'|';';';','] |> List.sortBy (fun c -> -line.Split(c).Length) |> List.head
        if sep = ',' || sep = ';' then
            csvrows sep filepath
        else
            tsvrows sep filepath

let headersFrom (filepath:string) =
    Seq.head (rows filepath)

let records (filepath:string) = seq {
        let lists = rows filepath
        let headers = Seq.head lists
        for vals in (Seq.skip 1 lists) do
            yield List.zip headers vals |> Map.ofList
    }

let listOfFileLines filepath =
    File.ReadAllLines(filepath) |> List.ofArray

let setOfFileLines filepath =
    File.ReadAllLines(filepath) |> Set.ofArray

let listOfColnum filepath colnum =
    rows filepath |> Seq.map (fun x -> x.[colnum]) |> List.ofSeq

let setOfColnum filepath colnum =
    rows filepath |> Seq.map (fun x -> x.[colnum]) |> Set.ofSeq

let listOfColumn filepath column =
    records filepath |> Seq.map (fun x -> x.[column]) |> List.ofSeq

let setOfColumn filepath column =
    records filepath |> Seq.map (fun x -> x.[column]) |> Set.ofSeq

let mapOfColnums filepath keycolnum valcolnum =
    rows filepath |> Seq.map (fun x -> (x.[keycolnum],x.[valcolnum])) |> Map.ofSeq

let mapOfColumns filepath keycol valcol =
    records filepath |> Seq.map (fun x -> (x.[keycol],x.[valcol])) |> Map.ofSeq

let countOfColnum filepath colnum =
    listOfColnum filepath colnum |> Seq.countBy (fun x -> x) |> Seq.sortBy (fun (v,c) -> -c)

let countOfColumn filepath column =
    listOfColumn filepath column |> Seq.countBy (fun x -> x) |> Seq.sortBy (fun (v,c) -> -c)

let showCountOfColnum filepath colnum =
    listOfColnum filepath colnum |> Seq.countBy (fun x -> x) |> Seq.sortBy (fun (v,c) -> -c) |> Seq.iter (fun (v,c) -> printfn "%5d  %s" c v)

let showCountOfColumn filepath column =
    listOfColumn filepath column |> Seq.countBy (fun x -> x) |> Seq.sortBy (fun (v,c) -> -c) |> Seq.iter (fun (v,c) -> printfn "%5d  %s" c v)

let showRowsAsList filepath =
    let rs = rows filepath |> Seq.take 3 |> Seq.toList
    let fmtfld (f:string) = f.Replace('n',' ').PadRight(20).Substring(0,20)
    List.iteri (fun i _ -> printfn "%s" (List.fold (fun c (r:string list) -> c + "  " + (fmtfld r.[i])) ((string i).PadLeft(3)) rs)) rs.[0]

let writeSet filepath set =
    File.WriteAllLines(filepath,(Set.toArray set))

let writeRow sep (tw:#TextWriter) record =
    let separator = sep.ToString()
    tw.WriteLine(List.reduce (fun (x:string) y -> x + separator + y) record)

let writeCsvRows sep (tw:#TextWriter) record =
    let quotefield sep (field:string) =
        if field.IndexOfAny([|sep;'"';'n'|]) <> -1 || field <> field.Trim() then
            "\"" + field.Replace("\"","\"""""\"") + "\""
        else
            field
    writeRow sep tw (List.map (quotefield sep) record)

let writeXlRow (tw:#TextWriter) record =
    let xlquotefield (field:string) =
        if field <> field.Trim() then
            "="" + field.Replace("\"","\"""""\"") + "\""
        elif Array.forall (fun c -> Char.IsDigit(c)) (field.ToCharArray()) then
            if field.Length > 15 || field.StartsWith("0") then "="" + field + "\""
            else field
        elif field.IndexOfAny([|',';'"';'n'|]) <> -1 then
            "\"" + field.Replace("\"","\"""""\"") + "\""
        else
            field
    writeRow ',' tw (List.map xlquotefield record)

let writeRows (filepath:string) lists =
    use sw = new StreamWriter(filepath)
    if filepath.EndsWith(".xl.csv") then
        Seq.iter (writeXlRow sw) lists
    else
        match Path.GetExtension(filepath) with
        | ".csv" -> Seq.iter (writeCsvRows ',' sw) lists
        | ".skv" -> Seq.iter (writeCsvRows ';' sw) lists
        | ".psv" -> Seq.iter (writeRow '|' sw) lists
        | ".tsv" | ".tab" | _ -> Seq.iter (writeRow '\t' sw) lists

let writeRecords (filepath:string) headers maps =
    let recordvals (map:Map<string,string>) =
        List.map (fun header -> map.[header]) headers
    writeRows filepath (Seq.append (seq[headers]) (Seq.map recordvals maps))

let writeMapJoin (filepath:string) (keyfield:string) (newcolname:string) (map:Map<string,string>) =
    let extprefix f p = Path.ChangeExtension(f,p+Path.GetExtension(f))
    let joins (record:Map<string,string>) = map.ContainsKey(record.[keyfield])
    let joined, unjoined = records filepath |> List.ofSeq |> List.partition joins
    let join (record:Map<string,string>) = record.Add(newcolname,map.[record.[keyfield]])
    List.map join joined |> writeRecords (extprefix filepath newcolname) ((headersFrom filepath) @ [newcolname])
    writeRecords (extprefix filepath "unmatched") (headersFrom filepath) unjoined

let writeFileJoin filepath keyfield mapfile keycol valcol =
    writeMapJoin filepath keyfield valcol (mapOfColumns mapfile keycol valcol)

let rowsKeysDiffExclude filepath1 filepath2 (keys:Set<int>) (exclude:Set<int>) =
    let keyvals (row:string list) = Set.map (fun i -> row.[i]) keys |> Set.toList
    let load = rows >> Seq.map (fun (r:string list) -> keyvals r, r) >> Map.ofSeq
    let f1, f2 = load filepath1, load filepath2
    let both, f1only = Map.partition (fun k r -> f2.ContainsKey(k)) f1
    let f2only = Map.filter (fun k r -> not(f1.ContainsKey(k))) f2
    let rec diffrow (rowA:string list) (rowB:string list) i diffs =
        if i < rowA.Length then
            if rowA.[i] <> rowB.[i] && not(exclude.Contains(i)) then
                diffrow rowA rowB (i+1) ((keyvals rowA @ [string (i+1);rowA.[i];rowB.[i]]) :: diffs)
            else
                diffrow rowA rowB (i+1) diffs
        else
            diffs
    let rowdiffs = Map.toList both |> List.collect (fun (k,v) -> diffrow v f2.[k] 0 [])
    f1only, f2only, rowdiffs

let rowsKeyDiff filepath1 filepath2 key =
    rowsKeysDiffExclude filepath1 filepath2 (set [key]) Set.empty

let recordsKeysDiffExclude filepath1 filepath2 (keys:Set<string>) (exclude:Set<string>) =
    let keyvals (record:Map<string,string>) = Set.map (fun c -> record.[c]) keys |> Set.toList
    let load = records >> Seq.map (fun (r:Map<string,string>) -> keyvals r, r) >> Map.ofSeq
    let f1, f2 = load filepath1, load filepath2
    let both, f1only = Map.partition (fun k r -> f2.ContainsKey(k)) f1
    let f2only = Map.filter (fun k r -> not(f1.ContainsKey(k))) f2
    let rec diffrecord (recordA:Map<string,string>) (recordB:Map<string,string>) diffs = function
        | [] -> diffs
        | c :: ctail when exclude.Contains(c) || recordA.[c] = recordB.[c] -> diffrecord recordA recordB diffs ctail
        | c :: ctail -> diffrecord recordA recordB ((keyvals recordA @ [c;recordA.[c];recordB.[c]]) :: diffs) ctail
    let colnames = Map.fold (fun ks k _ -> k :: ks) [] (Map.pick (fun _ v -> Some(v)) both)
    let recorddiffs = Map.toList both |> List.collect (fun (k,v) -> diffrecord v f2.[k] [] colnames)
    f1only, f2only, recorddiffs

let recordsKeyDiff filepath1 filepath2 key =
    recordsKeysDiffExclude filepath1 filepath2 (set [key]) Set.empty