|  |  | 1 |  | # simplified form of https://github.com/JuliaData/DataFrames.jl/blob/master/src/other/index.jl | 
"""
    AbstractIndex

Supertype of objects that translate column selectors (names, integers,
ranges, masks, ...) into integer column positions.
"""
abstract type AbstractIndex end

"""
    Index <: AbstractIndex

Two-way association between column names and their positions.

# Fields
- `lookup`: maps each name to its position in `names`.
- `names`: the names in positional order.
"""
struct Index <: AbstractIndex   # an OrderedDict would be nice here...
    lookup::Dict{AbstractString, Int}      # name => names array position
    names::Vector{AbstractString}
end

"""
    Index(names::Vector{<:AbstractString})

Build an `Index` in which the `i`-th entry of `names` is mapped to position `i`.

The index keeps its own copy of `names`, so later mutation of the argument
does not affect it.

# Throws
- `AssertionError` if `names` contains duplicates.
"""
function Index(names::Array{T, 1}) where T <: AbstractString
    # Thrown explicitly instead of through `@assert`: assertions may be disabled,
    # and skipping this check would leave `lookup` shorter than `names`.
    allunique(names) ||
        throw(AssertionError("names must be unique check for $names"))
    lookup = Dict{AbstractString, Int}(name => pos for (pos, name) in enumerate(names))
    # `collect` always allocates, so the stored vector never aliases the caller's
    # array (a plain conversion would alias it when T === AbstractString).
    return Index(lookup, collect(AbstractString, names))
end
|  |  | 14 |  |  | 
# Empty index: no names, hence nothing to look up.
function Index()
    return Index(Dict{AbstractString, Int}(), String[])
end

# Number of names held by the index.
function Base.length(x::Index)
    return length(x.names)
end

# Public accessor: hands out a copy so callers cannot mutate the index through it.
function Base.names(x::Index)
    return copy(x.names)
end

# Internal accessor: returns the stored vector itself, without copying.
function _names(x::Index)
    return x.names
end

# Copies both containers, so the result shares no mutable state with `x`.
function Base.copy(x::Index)
    return Index(copy(x.lookup), copy(x.names))
end
# Two indexes are considered equal when they hold the same names in the same
# order; it is enough to check names.
Base.isequal(x::AbstractIndex, y::AbstractIndex) = _names(x) == _names(y)
Base.:(==)(x::AbstractIndex, y::AbstractIndex) = isequal(x, y)
# `hash` must agree with `isequal`: without this method the default hash is used,
# so two equal indexes would be treated as different `Dict`/`Set` keys.
# The type is mixed in so an index does not trivially collide with its names vector.
Base.hash(x::AbstractIndex, h::UInt) = hash(_names(x), hash(AbstractIndex, h))
|  |  | 22 |  |  | 
# A name is a key when it is present in the lookup table.
function Base.haskey(x::Index, key::AbstractString)
    return haskey(x.lookup, key)
end

# An integer is a key when it is a valid position in the names vector.
function Base.haskey(x::Index, key::Integer)
    return 1 <= key && key <= length(x.names)
end

# `Bool` is an `Integer` subtype, so without this method `true` would be treated
# as position 1; it is rejected explicitly instead.
function Base.haskey(x::Index, key::Bool)
    throw(ArgumentError("invalid key: $key of type Bool"))
end

# The keys of an index are its names (returned as a copy by `names`).
function Base.keys(x::Index)
    return names(x)
end
|  |  | 28 |  |  | 
# `Bool` is an `Integer` subtype, so a scalar `true`/`false` would otherwise reach
# the integer method; it is rejected as a column selector here.
@inline function Base.getindex(x::AbstractIndex, idx::Bool)
    throw(ArgumentError("invalid index: $idx of type Bool"))
end
|  |  | 30 |  |  | 
# Resolve a single integer selector to a column position.
#
# Returns `idx` converted to `Int`.
# Throws `BoundsError` when `idx` is outside `1:length(x)`.
@inline function Base.getindex(x::AbstractIndex, idx::Integer)
    # `BoundsError` takes the accessed object and the offending index. Passing a
    # message string instead makes the string the "accessed object", and the
    # message text is never shown.
    1 <= idx <= length(x) || throw(BoundsError(x, idx))
    return Int(idx)
end
|  |  | 37 |  |  | 
# Validate a vector of `Int` positions and return it unchanged.
#
# Throws `BoundsError` when any position is outside `1:length(x)` and
# `ArgumentError` when positions repeat. An empty vector is returned as is.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{Int})
    isempty(idx) && return idx
    minidx, maxidx = extrema(idx)
    # `BoundsError` takes (object, index), not a message string; report the
    # offending position so the error names what was out of range.
    minidx < 1 && throw(BoundsError(x, minidx))
    maxidx > length(x) && throw(BoundsError(x, maxidx))
    allunique(idx) || throw(ArgumentError("Elements of $idx must be unique"))
    return idx
end
|  |  | 50 |  |  | 
# Validate a range of `Int` positions and return it unchanged (still a range).
#
# Throws `BoundsError` when an endpoint is outside `1:length(x)` and
# `ArgumentError` when positions repeat. An empty range is returned as is.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractRange{Int})
    isempty(idx) && return idx
    minidx, maxidx = extrema(idx)
    # `BoundsError` takes (object, index), not a message string; report the
    # offending endpoint so the error names what was out of range.
    minidx < 1 && throw(BoundsError(x, minidx))
    maxidx > length(x) && throw(BoundsError(x, maxidx))
    allunique(idx) || throw(ArgumentError("Elements of $idx must be unique"))
    return idx
end
|  |  | 63 |  |  | 
# Ranges of other integer types are materialised as `Vector{Int}` and then
# go through the usual position validation.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractRange{<:Integer})
    positions = collect(Int, idx)
    return x[positions]
end

# `:` selects every column.
@inline function Base.getindex(x::AbstractIndex, ::Colon)
    return Base.OneTo(length(x))
end
|  |  | 66 |  |  | 
# Vectors with a non-`Int` integer element type (including abstractly typed ones
# such as `Vector{Integer}`) are converted to `Vector{Int}` before validation.
# A `Bool` hidden among the elements is rejected rather than read as 0/1.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{<:Integer})
    for v in idx
        if v isa Bool
            throw(ArgumentError("Bool values except for AbstractVector{Bool} are not allowed for column indexing"))
        end
    end
    positions = Vector{Int}(idx)
    return x[positions]
end
|  |  | 73 |  |  | 
# A range of `Bool` is materialised and then handled as a Boolean mask.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractRange{Bool})
    mask = collect(idx)
    return x[mask]
end
|  |  | 75 |  |  | 
# Boolean mask: must have exactly one entry per column; returns the positions
# of the `true` entries.
@inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{Bool})
    if length(idx) != length(x)
        throw(BoundsError(x, idx))
    end
    return findall(idx)
end
|  |  | 80 |  |  | 
# catch all method handling cases when type of idx is not narrowest possible, Any in particular
#
# The first element decides whether `idxs` is treated as positions or as names.
# Every element is then checked before conversion, so a mixed vector is reported
# with an `ArgumentError` instead of failing inside `convert`.
@inline function Base.getindex(x::AbstractIndex, idxs::AbstractVector)
    isempty(idxs) && return Int[] # special case of empty idxs
    head = first(idxs)
    if head isa Real
        if !all(v -> v isa Integer && !(v isa Bool), idxs)
            throw(ArgumentError("Only Integer values allowed when indexing by vector of numbers"))
        end
        return getindex(x, convert(Vector{Int}, idxs))
    end
    if head isa AbstractString
        # Same whole-vector validation as the numeric branch above.
        if !all(v -> v isa AbstractString, idxs)
            throw(ArgumentError("Only String values allowed when indexing by vector of strings"))
        end
        return getindex(x, convert(Vector{AbstractString}, idxs))
    end
    throw(ArgumentError("idxs[1] has type $(typeof(head)); " *
                        "Only Integer or String values allowed when indexing by vector"))
end
|  |  | 94 |  |  | 
# Select the columns whose name matches `rx`: the matching names are collected
# in index order and then resolved through the vector-of-names method.
@inline function Base.getindex(x::AbstractIndex, rx::Regex)
    matched = filter(_names(x)) do name
        occursin(rx, String(name))
    end
    return x[matched]
end
|  |  | 98 |  |  | 
"""
    fuzzymatch(l::Dict{AbstractString, Int}, idx::AbstractString;
               maxdist::Integer=2, maxshown::Integer=8)

Return the keys of `l` that are similar to `idx`, ordered by `(distance, name)`.

Matching rules:

1. Case is ignored (both sides are upper-cased before comparing).
2. Only names within Levenshtein distance `maxdist` of `idx` are considered.
3. Names at distance 0 (differing only in case) are always returned.
4. Beyond rule 3, the distance cut-off is the largest one for which no more
   than `maxshown` names are returned in total.

# Keywords
- `maxdist`: largest Levenshtein distance considered (default `2`).
- `maxshown`: cap on the number of returned names, subject to rule 3 (default `8`).
"""
function fuzzymatch(l::Dict{AbstractString, Int}, idx::AbstractString;
                    maxdist::Integer=2, maxshown::Integer=8)
    target = uppercase(idx)
    dist = [(REPL.levenshtein(uppercase(name), target), name) for name in keys(l)]
    sort!(dist)
    # within[k] is the number of candidates at distance <= k - 1; it is
    # non-decreasing in k, which is what `searchsortedlast` relies on.
    within = [count(p -> p[1] <= d, dist) for d in 0:maxdist]
    # Largest distance whose cumulative count still fits into `maxshown`;
    # clamped at 0 so exact (case-insensitive) matches are never dropped.
    cutoff = max(0, searchsortedlast(within, maxshown) - 1)
    return [name for (d, name) in dist if d <= cutoff]
end
|  |  | 116 |  |  | 
# Return the position stored for `idx` in `l`.
#
# When `idx` is absent an `ArgumentError` is thrown; if `fuzzymatch` finds
# similar names they are listed in the message as suggestions.
@inline function lookupname(l::Dict{AbstractString, Int}, idx::AbstractString)
    pos = get(l, idx, nothing)
    pos === nothing || return pos

    suggestions = fuzzymatch(l, idx)
    msg = "column name :$idx not found in the data frame"
    if !isempty(suggestions)
        listed = join(string.(':', suggestions), ", ", " and ")
        msg = msg * "; " * "existing most similar names are: $listed"
    end
    throw(ArgumentError(msg))
end
|  |  | 130 |  |  | 
# A single name resolves to its position (or throws via `lookupname`).
@inline function Base.getindex(x::Index, idx::AbstractString)
    return lookupname(x.lookup, idx)
end

# A vector of names resolves element-wise; repeated names are rejected.
@inline function Base.getindex(x::Index, idx::AbstractVector{AbstractString})
    if !allunique(idx)
        throw(ArgumentError("Elements of $idx must be unique"))
    end
    table = x.lookup
    return [lookupname(table, name) for name in idx]
end