DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world
Dsv.fs
// library for working with delimiter-separated values files
module Dsv
open System
open System.IO
// An F# mutually-tail-recursive CSV record parser.
// See the spec at http://creativyst.com/Doc/Articles/CSV/CSV01.htm
let rec csvrecord sep (tr:#TextReader) record (line:string) i =
if i = line.Length then record @ [""]
else
match line.[i] with
| '"' -> csvfield sep tr record "" line (i+1)
| ' ' | '\t' -> csvrecord sep tr record line (i+1)
| c when c = sep -> csvrecord sep tr (record @ [""]) line (i+1)
| '=' when line.[i+1] = '"' -> csvfield sep tr record "" line (i+2) // Excel compatibility
| _ -> // unquoted field data
let fs = line.IndexOf(sep,i)
if fs = -1 then record @ [line.Substring(i).TrimEnd()]
else
csvrecord sep tr (record @ [line.Substring(i,fs-i).TrimEnd()]) line (fs+1)
and csvfield sep (tr:#TextReader) record field (line:string) i =
if i = line.Length then csvfield sep tr record (field+"\n") (tr.ReadLine()) 0
elif line.[i] <> '"' then
let q = line.IndexOf('"',i)
if q = -1 then csvfield sep tr record (field+line.Substring(i)+"\n") (tr.ReadLine()) 0
else csvfield sep tr record (field+line.Substring(i,q-i)) line q
elif i = line.Length-1 then record @ [field]
elif line.[i+1] = '"' then csvfield sep tr record (field+"\"") line (i+2)
elif line.[i+1] = sep then csvrecord sep tr (record @ [field]) line (i+2)
else // not an escaped quote and not end of field; try to recover by appending trimmed unquoted field data
let fs = line.IndexOf(sep,i+1)
if fs = -1 then record @ [field+line.Substring(i).TrimEnd()]
else csvrecord sep tr (record @ [field+line.Substring(i,fs-i).TrimEnd()]) line (fs+1)
let csvrows sep (filepath:string) = seq {
use sr = new StreamReader(filepath)
while not sr.EndOfStream do
yield csvrecord sep sr [] (sr.ReadLine()) 0
}
let csvrecords sep (filepath:string) = seq {
let lists = csvrows sep filepath
let headers = Seq.head lists
for vals in (Seq.skip 1 lists) do
yield List.zip headers vals |> Map.ofList
}
let tsvrows (sep:char) (filepath:string) = seq {
use sr = new StreamReader(filepath)
while not sr.EndOfStream do
yield sr.ReadLine().Split(sep) |> List.ofArray
}
let rows (filepath:string) =
let filename = Path.GetFileName(filepath)
match Path.GetExtension(filename).ToLower() with
| ".csv" -> csvrows ',' filepath
| ".tsv" | ".tab" -> tsvrows '\t' filepath
| ".skv" -> csvrows ';' filepath
| ".psv" -> tsvrows '|' filepath
| ".log" when filename.StartsWith("ex") || filename.StartsWith("u_ex") -> tsvrows ' ' filepath
| _ ->
use sr = new StreamReader(filepath)
let line = sr.ReadLine()
let sep = ['\t';'|';';';','] |> List.sortBy (fun c -> -line.Split(c).Length) |> List.head
if sep = ',' || sep = ';' then
csvrows sep filepath
else
tsvrows sep filepath
let headersFrom (filepath:string) =
Seq.head (rows filepath)
let records (filepath:string) = seq {
let lists = rows filepath
let headers = Seq.head lists
for vals in (Seq.skip 1 lists) do
yield List.zip headers vals |> Map.ofList
}
let listOfFileLines filepath =
File.ReadAllLines(filepath) |> List.ofArray
let setOfFileLines filepath =
File.ReadAllLines(filepath) |> Set.ofArray
let listOfColnum filepath colnum =
rows filepath |> Seq.map (fun x -> x.[colnum]) |> List.ofSeq
let setOfColnum filepath colnum =
rows filepath |> Seq.map (fun x -> x.[colnum]) |> Set.ofSeq
let listOfColumn filepath column =
records filepath |> Seq.map (fun x -> x.[column]) |> List.ofSeq
let setOfColumn filepath column =
records filepath |> Seq.map (fun x -> x.[column]) |> Set.ofSeq
let mapOfColnums filepath keycolnum valcolnum =
rows filepath |> Seq.map (fun x -> (x.[keycolnum],x.[valcolnum])) |> Map.ofSeq
let mapOfColumns filepath keycol valcol =
records filepath |> Seq.map (fun x -> (x.[keycol],x.[valcol])) |> Map.ofSeq
let countOfColnum filepath colnum =
listOfColnum filepath colnum |> Seq.countBy (fun x -> x) |> Seq.sortBy (fun (v,c) -> -c)
let countOfColumn filepath column =
listOfColumn filepath column |> Seq.countBy (fun x -> x) |> Seq.sortBy (fun (v,c) -> -c)
let showCountOfColnum filepath colnum =
listOfColnum filepath colnum |> Seq.countBy (fun x -> x) |> Seq.sortBy (fun (v,c) -> -c) |> Seq.iter (fun (v,c) -> printfn "%5d %s" c v)
let showCountOfColumn filepath column =
listOfColumn filepath column |> Seq.countBy (fun x -> x) |> Seq.sortBy (fun (v,c) -> -c) |> Seq.iter (fun (v,c) -> printfn "%5d %s" c v)
let showRowsAsList filepath =
let rs = rows filepath |> Seq.take 3 |> Seq.toList
let fmtfld (f:string) = f.Replace('n',' ').PadRight(20).Substring(0,20)
List.iteri (fun i _ -> printfn "%s" (List.fold (fun c (r:string list) -> c + " " + (fmtfld r.[i])) ((string i).PadLeft(3)) rs)) rs.[0]
let writeSet filepath set =
File.WriteAllLines(filepath,(Set.toArray set))
let writeRow sep (tw:#TextWriter) record =
let separator = sep.ToString()
tw.WriteLine(List.reduce (fun (x:string) y -> x + separator + y) record)
let writeCsvRows sep (tw:#TextWriter) record =
let quotefield sep (field:string) =
if field.IndexOfAny([|sep;'"';'n'|]) <> -1 || field <> field.Trim() then
"\"" + field.Replace("\"","\"""""\"") + "\""
else
field
writeRow sep tw (List.map (quotefield sep) record)
let writeXlRow (tw:#TextWriter) record =
let xlquotefield (field:string) =
if field <> field.Trim() then
"="" + field.Replace("\"","\"""""\"") + "\""
elif Array.forall (fun c -> Char.IsDigit(c)) (field.ToCharArray()) then
if field.Length > 15 || field.StartsWith("0") then "="" + field + "\""
else field
elif field.IndexOfAny([|',';'"';'n'|]) <> -1 then
"\"" + field.Replace("\"","\"""""\"") + "\""
else
field
writeRow ',' tw (List.map xlquotefield record)
let writeRows (filepath:string) lists =
use sw = new StreamWriter(filepath)
if filepath.EndsWith(".xl.csv") then
Seq.iter (writeXlRow sw) lists
else
match Path.GetExtension(filepath) with
| ".csv" -> Seq.iter (writeCsvRows ',' sw) lists
| ".skv" -> Seq.iter (writeCsvRows ';' sw) lists
| ".psv" -> Seq.iter (writeRow '|' sw) lists
| ".tsv" | ".tab" | _ -> Seq.iter (writeRow '\t' sw) lists
let writeRecords (filepath:string) headers maps =
let recordvals (map:Map<string,string>) =
List.map (fun header -> map.[header]) headers
writeRows filepath (Seq.append (seq[headers]) (Seq.map recordvals maps))
let writeMapJoin (filepath:string) (keyfield:string) (newcolname:string) (map:Map<string,string>) =
let extprefix f p = Path.ChangeExtension(f,p+Path.GetExtension(f))
let joins (record:Map<string,string>) = map.ContainsKey(record.[keyfield])
let joined, unjoined = records filepath |> List.ofSeq |> List.partition joins
let join (record:Map<string,string>) = record.Add(newcolname,map.[record.[keyfield]])
List.map join joined |> writeRecords (extprefix filepath newcolname) ((headersFrom filepath) @ [newcolname])
writeRecords (extprefix filepath "unmatched") (headersFrom filepath) unjoined
let writeFileJoin filepath keyfield mapfile keycol valcol =
writeMapJoin filepath keyfield valcol (mapOfColumns mapfile keycol valcol)
let rowsKeysDiffExclude filepath1 filepath2 (keys:Set<int>) (exclude:Set<int>) =
let keyvals (row:string list) = Set.map (fun i -> row.[i]) keys |> Set.toList
let load = rows >> Seq.map (fun (r:string list) -> keyvals r, r) >> Map.ofSeq
let f1, f2 = load filepath1, load filepath2
let both, f1only = Map.partition (fun k r -> f2.ContainsKey(k)) f1
let f2only = Map.filter (fun k r -> not(f1.ContainsKey(k))) f2
let rec diffrow (rowA:string list) (rowB:string list) i diffs =
if i < rowA.Length then
if rowA.[i] <> rowB.[i] && not(exclude.Contains(i)) then
diffrow rowA rowB (i+1) ((keyvals rowA @ [string (i+1);rowA.[i];rowB.[i]]) :: diffs)
else
diffrow rowA rowB (i+1) diffs
else
diffs
let rowdiffs = Map.toList both |> List.collect (fun (k,v) -> diffrow v f2.[k] 0 [])
f1only, f2only, rowdiffs
let rowsKeyDiff filepath1 filepath2 key =
rowsKeysDiffExclude filepath1 filepath2 (set [key]) Set.empty
let recordsKeysDiffExclude filepath1 filepath2 (keys:Set<string>) (exclude:Set<string>) =
let keyvals (record:Map<string,string>) = Set.map (fun c -> record.[c]) keys |> Set.toList
let load = records >> Seq.map (fun (r:Map<string,string>) -> keyvals r, r) >> Map.ofSeq
let f1, f2 = load filepath1, load filepath2
let both, f1only = Map.partition (fun k r -> f2.ContainsKey(k)) f1
let f2only = Map.filter (fun k r -> not(f1.ContainsKey(k))) f2
let rec diffrecord (recordA:Map<string,string>) (recordB:Map<string,string>) diffs = function
| [] -> diffs
| c :: ctail when exclude.Contains(c) || recordA.[c] = recordB.[c] -> diffrecord recordA recordB diffs ctail
| c :: ctail -> diffrecord recordA recordB ((keyvals recordA @ [c;recordA.[c];recordB.[c]]) :: diffs) ctail
let colnames = Map.fold (fun ks k _ -> k :: ks) [] (Map.pick (fun _ v -> Some(v)) both)
let recorddiffs = Map.toList both |> List.collect (fun (k,v) -> diffrecord v f2.[k] [] colnames)
f1only, f2only, recorddiffs
let recordsKeyDiff filepath1 filepath2 key =
recordsKeysDiffExclude filepath1 filepath2 (set [key]) Set.empty





