Extract a wealth of information from lists
MIT License
Extract a wealth of information from lists.
Goldmine is especially helpful when working with source data that is difficult to query. e.g. CSV files, API results, etc...
gem install goldmine
require "goldmine"
list = [1,2,3,4,5,6,7,8,9]
Goldmine(list)
.pivot("< 5") { |i| i < 5 }
.to_h
{
[["< 5", true]] => [1, 2, 3, 4],
[["< 5", false]] => [5, 6, 7, 8, 9]
}
users = [
{ :name => "Sally", :favorite_colors => [:blue] },
{ :name => "John", :favorite_colors => [:blue, :green] },
{ :name => "Stephen", :favorite_colors => [:red, :pink, :purple] },
{ :name => "Emily", :favorite_colors => [:orange, :green] },
{ :name => "Joe", :favorite_colors => [:red] }
]
Goldmine(users)
.pivot(:favorite_color) { |record| record[:favorite_colors] }
.to_h
{
[:favorite_color, :blue] => [{:name=>"Sally", :favorite_colors=>[:blue]}, {:name=>"John", :favorite_colors=>[:blue, :green]}],
[:favorite_color, :green] => [{:name=>"John", :favorite_colors=>[:blue, :green]}, {:name=>"Emily", :favorite_colors=>[:orange, :green]}],
[:favorite_color, :red] => [{:name=>"Stephen", :favorite_colors=>[:red, :pink, :purple]}, {:name=>"Joe", :favorite_colors=>[:red]}],
[:favorite_color, :pink] => [{:name=>"Stephen", :favorite_colors=>[:red, :pink, :purple]}],
[:favorite_color, :purple] => [{:name=>"Stephen", :favorite_colors=>[:red, :pink, :purple]}],
[:favorite_color, :orange] => [{:name=>"Emily", :favorite_colors=>[:orange, :green]}]
}
users = [
{ :name => "Sally", :age => 21 },
{ :name => "John", :age => 28 },
{ :name => "Stephen", :age => 37 },
{ :name => "Emily", :age => 32 },
{ :name => "Joe", :age => 18 }
]
Goldmine(users)
.pivot("'e' in name") { |user| !!user[:name].match(/e/i) }
.pivot("21 or over") { |user| user[:age] >= 21 }
.to_h
{
[["'e' in name", false], ["21 or over", true]] => [{:name=>"Sally", :age=>21}, {:name=>"John", :age=>28}],
[["'e' in name", true], ["21 or over", true]] => [{:name=>"Stephen", :age=>37}, {:name=>"Emily", :age=>32}],
[["'e' in name", true], ["21 or over", false]] => [{:name=>"Joe", :age=>18}]
}
Rollups provide an intuitive way to aggregate pivoted data into a report friendly format. Think computed columns.
Rollups are blocks
that get executed once for each pivot entry.
They can be also be chained.
list = [1,2,3,4,5,6,7,8,9]
Goldmine(list)
.pivot("< 5") { |i| i < 5 }
.pivot("even") { |i| i % 2 == 0 }
.rollup("count", &:count)
.to_h
{
[["< 5", true], ["even", false]] => [["count", 2]],
[["< 5", true], ["even", true]] => [["count", 2]],
[["< 5", false], ["even", false]] => [["count", 3]],
[["< 5", false], ["even", true]] => [["count", 2]]
}
Rollups can be computationally expensive. Optional caching can be used to reduce this computational overhead.
list = [1,2,3,4,5,6,7,8,9]
Goldmine(list, cache: true)
.pivot(:less_than_5) { |i| i < 5 }
.rollup(:count, &:count)
.rollup(:evens) { |list| list.select { |i| i % 2 == 0 }.count }
.rollup(:even_percentage) { |list| cache[:evens] / cache[:count].to_f }
.to_h
{
[[:less_than_5, true]] => [[:count, 4], [:evens, 2], [:even_percentage, 0.5]],
[[:less_than_5, false]] => [[:count, 5], [:evens, 2], [:even_percentage, 0.4]]
}
It's often helpful to flatten rollups into rows.
list = [1,2,3,4,5,6,7,8,9]
result = Goldmine(list, cache: true)
.pivot(:less_than_5) { |i| i < 5 }
.rollup(:count, &:count)
.rollup(:evens) { |list| list.select { |i| i % 2 == 0 }.count }
.rollup(:even_percentage) { |list| cache[:evens] / cache[:count].to_f }
.result
result.to_rows
[
[[:less_than_5, true], [:count, 4], [:evens, 2], [:even_percentage, 0.5]],
[[:less_than_5, false], [:count, 5], [:evens, 2], [:even_percentage, 0.4]]
]
result.to_hash_rows
[
{:less_than_5=>true, :count=>4, :evens=>2, :even_percentage=>0.5},
{:less_than_5=>false, :count=>5, :evens=>2, :even_percentage=>0.4}
]
Rollups can also be converted into tabular format.
list = [1,2,3,4,5,6,7,8,9]
Goldmine(list)
.pivot(:less_than_5) { |i| i < 5 }
.pivot(:even) { |i| i % 2 == 0 }
.rollup(:count, &:count)
.to_tabular
[
[:less_than_5, :even, :count],
[true, false, 2],
[true, true, 2],
[false, false, 3],
[false, true, 2]
]
Rollups can also be converted into CSV format.
list = [1,2,3,4,5,6,7,8,9]
Goldmine(list)
.pivot(:less_than_5) { |i| i < 5 }
.pivot(:even) { |i| i % 2 == 0 }
.rollup(:count, &:count)
.to_csv
"less_than_5,even,count\ntrue,false,2\ntrue,true,2\nfalse,false,3\nfalse,true,2\n"
All examples are small Sinatra apps. They are designed to help communicate Goldmine use-cases.
git clone [email protected]:hopsoft/goldmine.git
cd /path/to/goldmine
bundle
Uses data from https://github.com/hopsoft/goldmine/blob/master/examples/new_york_wifi_hotspots/DOITT_WIFI_HOTSPOT_01_13SEPT2010.csv
In this example, we mine out the following information.
ruby examples/new_york_wifi_hotspots/app.rb
curl http://localhost:3000/raw
curl http://localhost:3000/pivoted
curl http://localhost:3000/rolled_up
curl http://localhost:3000/rows
curl http://localhost:3000/tabular
curl http://localhost:3000/csv
Uses data from http://dev.socrata.com/foundry/#/data.medicare.gov/aeay-dfax
In this example, we mine out the following information.
ruby examples/medicare_physician_compare/app.rb
curl http://localhost:3000/raw
curl http://localhost:3000/pivoted
curl http://localhost:3000/rolled_up
curl http://localhost:3000/rows
curl http://localhost:3000/tabular
curl http://localhost:3000/csv
The Medicare dataset is large & works well for performance testing.
My Macbook Pro yields the following benchmarks.
user system total real
pivoted 0.630000 0.030000 0.660000 ( 0.670409)
rolled_up 0.570000 0.030000 0.600000 ( 0.626413)
rows 0.010000 0.000000 0.010000 ( 0.003258)
tabular 0.010000 0.000000 0.010000 ( 0.010110)
csv 0.050000 0.000000 0.050000 ( 0.057677)
user system total real
pivoted 7.270000 0.300000 7.570000 ( 8.053166)
rolled_up 6.800000 0.830000 7.630000 ( 8.051707)
rows 0.000000 0.000000 0.000000 ( 0.003934)
tabular 0.010000 0.000000 0.010000 ( 0.011825)
csv 0.210000 0.010000 0.220000 ( 0.222752)
Goldmine makes data highly malleable. It allows you to combine the power of pivots, rollups, tabular data, & csv to construct deep insights with minimal effort.
Real world use cases include: