| | 1 | | # simplified form of https://github.com/JuliaData/DataFrames.jl/blob/master/src/other/index.jl |
"""
    AbstractIndex

Supertype of objects that translate column selectors into integer positions.
"""
abstract type AbstractIndex end

"""
    Index <: AbstractIndex

Name-to-position index.

# Fields
- `lookup`: maps each name to its position in `names`.
- `names`: the names, in positional order.
"""
struct Index <: AbstractIndex # an OrderedDict would be nice here...
    lookup::Dict{AbstractString, Int} # name => names array position
    names::Vector{AbstractString}
end

"""
    Index(names::Vector{<:AbstractString})

Build an `Index` whose `i`-th name is `names[i]`.

Throws an `AssertionError` when `names` contains duplicates.
"""
function Index(names::Array{T, 1}) where T <: AbstractString
    # Explicit throw instead of `@assert`: assertions may be disabled, and an
    # index with duplicate names would have `lookup` and `names` out of sync.
    allunique(names) ||
        throw(AssertionError("names must be unique check for $names"))
    lookup = Dict{AbstractString, Int}(name => pos for (pos, name) in enumerate(names))
    # `collect` always allocates, so the index never shares storage with the
    # caller's vector (the implicit `convert` is a no-op for Vector{AbstractString}).
    return Index(lookup, collect(AbstractString, names))
end
| | 14 | |
|
# Empty index: no names and an empty lookup table.
function Index()
    return Index(Dict{AbstractString, Int}(), String[])
end

# Internal accessor: returns the stored names vector itself (no copy).
function _names(x::Index)
    return x.names
end

# Number of names held by the index.
function Base.length(x::Index)
    return length(x.names)
end

# Public accessor: returns a copy of the stored names vector.
function Base.names(x::Index)
    return copy(x.names)
end

# Copy of the index with its own lookup table and names vector.
function Base.copy(x::Index)
    return Index(copy(x.lookup), copy(x.names))
end
# Two indexes are equal when they hold the same names in the same order;
# it is enough to check names.
Base.isequal(x::AbstractIndex, y::AbstractIndex) = _names(x) == _names(y)
Base.:(==)(x::AbstractIndex, y::AbstractIndex) = isequal(x, y)
# `hash` must agree with `isequal`/`==`; otherwise equal indexes behave as
# distinct keys in a `Dict` or distinct elements in a `Set`.
Base.hash(x::AbstractIndex, h::UInt) = hash(_names(x), h)
| | 22 | |
|
# A name is a valid key when it is present in the lookup table.
function Base.haskey(x::Index, key::AbstractString)
    return haskey(x.lookup, key)
end

# An integer is a valid key when it is a position within the names vector.
function Base.haskey(x::Index, key::Integer)
    return key >= 1 && key <= length(x.names)
end

# Bool is an Integer subtype but is rejected as a key.
function Base.haskey(x::Index, key::Bool)
    throw(ArgumentError("invalid key: $key of type Bool"))
end

# The keys of an index are (a copy of) its names.
function Base.keys(x::Index)
    return names(x)
end
| | 28 | |
|
# A single Bool is never accepted as a column index.
@inline Base.getindex(x::AbstractIndex, idx::Bool) = throw(ArgumentError("invalid index: $idx of type Bool"))

# Integer position: validate it against `length(x)` and return it as an `Int`.
# Throws `BoundsError` when `idx` is outside `1:length(x)`.
@inline function Base.getindex(x::AbstractIndex, idx::Integer)
    # `BoundsError(x, idx)` records the accessed object and the offending index;
    # passing a message string would store that string as the accessed container.
    1 <= idx <= length(x) || throw(BoundsError(x, idx))
    return Int(idx)
end
| | 37 | |
|
# Vector of Int positions: returns `idx` itself after validating it.
# Throws `BoundsError` when any position is outside `1:length(x)` and
# `ArgumentError` when positions repeat. An empty `idx` is returned unchanged.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{Int})
    isempty(idx) && return idx
    minidx, maxidx = extrema(idx)
    # `BoundsError(x, idx)` records the accessed object and the requested
    # indices; passing a message string would store that string as the container.
    (minidx < 1 || maxidx > length(x)) && throw(BoundsError(x, idx))
    allunique(idx) || throw(ArgumentError("Elements of $idx must be unique"))
    return idx
end
| | 50 | |
|
# Range of Int positions: returns `idx` itself after validating it.
# Throws `BoundsError` when either end of the range is outside `1:length(x)`
# and `ArgumentError` when positions repeat. An empty range is returned unchanged.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractRange{Int})
    isempty(idx) && return idx
    minidx, maxidx = extrema(idx)
    # `BoundsError(x, idx)` records the accessed object and the requested
    # indices; passing a message string would store that string as the container.
    (minidx < 1 || maxidx > length(x)) && throw(BoundsError(x, idx))
    allunique(idx) || throw(ArgumentError("Elements of $idx must be unique"))
    return idx
end
| | 63 | |
|
# Range with a non-Int integer element type: materialize it as a Vector{Int}
# and index with that.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractRange{<:Integer})
    return x[collect(Int, idx)]
end

# Colon selects every position.
@inline function Base.getindex(x::AbstractIndex, ::Colon)
    return Base.OneTo(length(x))
end
| | 66 | |
|
# Vector with an integer element type: reject any Bool element, then convert
# to Vector{Int} and index with that.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{<:Integer})
    containsbool = any(v -> v isa Bool, idx)
    containsbool &&
        throw(ArgumentError("Bool values except for AbstractVector{Bool} are not allowed for column indexing"))
    return x[Vector{Int}(idx)]
end
| | 73 | |
|
# Range of Bool: materialize it and index with the resulting vector.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractRange{Bool})
    return x[collect(idx)]
end

# Boolean mask: must have exactly one entry per position; returns the
# positions whose mask entry is `true`.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{Bool})
    length(x) != length(idx) && throw(BoundsError(x, idx))
    return findall(idx)
end
| | 80 | |
|
# Fallback for vectors whose element type is not the narrowest possible
# (`Any` in particular). The first element decides how the vector is handled:
# numbers must all be non-Bool Integers, strings are converted to
# Vector{AbstractString}; anything else is an error.
@inline function Base.getindex(x::AbstractIndex, idxs::AbstractVector)
    # special case of empty idxs
    if isempty(idxs)
        return Int[]
    end
    if idxs[1] isa Real
        onlyints = all(v -> v isa Integer && !(v isa Bool), idxs)
        onlyints ||
            throw(ArgumentError("Only Integer values allowed when indexing by vector of numbers"))
        return x[convert(Vector{Int}, idxs)]
    elseif idxs[1] isa AbstractString
        return x[convert(Vector{AbstractString}, idxs)]
    end
    throw(ArgumentError("idxs[1] has type $(typeof(idxs[1])); "*
                        "Only Integer or String values allowed when indexing by vector"))
end
| | 94 | |
|
# Regex selector: collect the names the pattern occurs in, then index with
# that vector of names.
@inline function Base.getindex(x::AbstractIndex, rx::Regex)
    matching = AbstractString[name for name in _names(x) if occursin(rx, String(name))]
    return x[matching]
end
| | 98 | |
|
"""
    fuzzymatch(l::Dict, idx::AbstractString)

Return the keys of `l` that are similar to `idx`, ordered by
`(distance, name)`.

Fuzzy matching rules:

1. case is ignored;
2. the maximum Levenshtein distance is 2;
3. matches with 0 difference (wrong case) are always shown;
4. on top of 3., no more than 8 matches are shown in total.
"""
function fuzzymatch(l::Dict{AbstractString, Int}, idx::AbstractString)
    target = uppercase(idx)
    scored = [(REPL.levenshtein(uppercase(name), target), name) for name in keys(l)]
    sort!(scored)
    # within[k] = number of candidates whose distance is at most k - 1
    within = [count(pair -> pair[1] <= limit, scored) for limit in 0:2]
    # largest distance that still keeps the total at 8 or fewer; never below 0
    cutoff = max(0, searchsortedlast(within, 8) - 1)
    return [name for (distance, name) in scored if distance <= cutoff]
end
| | 116 | |
|
# Return the position stored for `idx` in `l`. When `idx` is absent, throw an
# `ArgumentError`; the message lists the fuzzy-matched candidate names if any.
@inline function lookupname(l::Dict{AbstractString, Int}, idx::AbstractString)
    pos = get(l, idx, nothing)
    pos === nothing || return pos

    nearby = fuzzymatch(l, idx)
    if !isempty(nearby)
        candidatesstr = join(string.(':', nearby), ", ", " and ")
        throw(ArgumentError("column name :$idx not found in the data frame; " *
                            "existing most similar names are: $candidatesstr"))
    end
    throw(ArgumentError("column name :$idx not found in the data frame"))
end
| | 130 | |
|
# Single name: position of that name in the lookup table.
@inline function Base.getindex(x::Index, idx::AbstractString)
    return lookupname(x.lookup, idx)
end

# Vector of names: names must be unique; returns one position per name,
# in the order given.
@inline function Base.getindex(x::Index, idx::AbstractVector{AbstractString})
    if !allunique(idx)
        throw(ArgumentError("Elements of $idx must be unique"))
    end
    table = x.lookup
    return [lookupname(table, name) for name in idx]
end