patait / dswj Goto Github PK

View Code? Open in Web Editor NEW

23.0 2.0 17.0 3.71 MB

Official code repository for the book "Data Science with Julia" by McNicholas and Tait

License: GNU General Public License v3.0

Julia 100.00%

dswj's Introduction

dswj

Official code repository for the book "Data Science with Julia" by McNicholas and Tait

All the code in this repository uses the GNU General Public License v3 (GPL-3)

The book's official URL is: https://www.crcpress.com/Data-Science-with-Julia/McNicholas-Tait/p/book/9781138499980

Directories:

src: contains the source code for chapters 2-7 of the book.
data: data files used in the book.

If you:

discover any bugs with the code, please log an issue.
would like to discuss the book, the code or both, please contribute to the discussions.

The code in the repository will be updated in the first quarter of 2021. The updates will reflect the current state of the package ecosystem and core language (Julia 1.5.x +).

dswj's People

Contributors

Stargazers

Watchers

Forkers

caozq19 jacobxk muhammadhakami virgile-baudrot snowdj lnsongxf startupit69 sfmb-mx ym-han jreakerian cescalantec galexandros aymanaamin standardgalactic jp127266 ekys12 billygareth

dswj's Issues

Chapter 3 functions.jl

Submitted by Bill Doak via email

Suggested fix:

`
using DataFrames, MLLabelUtils

"""
    onehot_encoding!(df::DataFrame, col::String; trace = false)

Add one indicator column to `df` for every distinct value of column `col`.

Each new column is the result of
`convertlabel(LabelEnc.ZeroOne, values, LabelEnc.OneVsRest(level))`, i.e. a
one-vs-rest encoding of `col` for that level. The new column names are `col`
followed by the level label (title-cased when the level is a `String`, with
spaces replaced by underscores).

If `col` contains `missing` values, all values are converted to strings first
and the missing entries are encoded as their own level, `"missing"`.

# Keywords
- `trace`: when `true`, print the level labels, their container type and the
  number of missing values before the columns are created.

# Returns
`nothing`; `df` is modified in place.
"""
function onehot_encoding!(df::DataFrame, col::String; trace = false)

    tmp = df[!, col]
    # Distinct raw values. These (not their prettified labels) are what each
    # indicator column must be matched against.
    targets = unique(tmp)
    # Display labels, used only to build the new column names.
    lev = map(x -> isa(x, String) ? titlecase(x) : string(x), targets)
    colname = copy(lev)
    nmiss = count(ismissing, tmp)

    if trace
        println("lev: $lev")
        println("typeof(colname): $(typeof(colname))")
        println("colname: $colname")
        println("nmiss: $nmiss")
    end

    if nmiss > 0
        # Stringify the values and the match targets the same way so that the
        # "missing" level (and every other level) still lines up.
        tostring = x -> ismissing(x) ? "missing" : string(x)
        tmp = convert(Array{String,1}, map(tostring, tmp))
        targets = map(tostring, targets)
    end

    # `colname` is always a Vector{String} at this point, so the names are
    # always built from the level labels.
    colname = .*(col, map(x -> replace(x, " " => "_"), colname))

    for (name, target) in zip(colname, targets)
        df[!, name] = convertlabel(LabelEnc.ZeroOne, tmp, LabelEnc.OneVsRest(target))
    end

    return nothing
end

Chapter 3 - data_class.jl

Submitted via email by Bill Doak
Suggested solution:

`using DataFrames, Query, CSV, JLD2, StatsBase, MLLabelUtils, Random, FileIO
include("chp3_functions.jl")
Random.seed!(24908)
# chp3_data_class.jl

## Element types for the file's columns: every column may hold missing values
IntOrMiss = Union{Missing,Int64}
FltOrMiss = Union{Missing,Float64}
StrOrMiss = Union{Missing,String}


## Column names, in the order the columns are listed for the file
recipe_header = String[
  "beer_id",
  "name",
  "url",
  "style",
  "style_id",
  "size",
  "og",
  "fg",
  "abv",
  "ibu",
  "color",
  "boil_size",
  "boil_time",
  "biol_grav",
  "efficiency",
  "mash_thick",
  "sugar_scale",
  "brew_method",
  "pitch_rate",
  "pri_temp",
  "prime_method",
  "prime_am",
]

## Column name => element type, built from the columns grouped by type
recipe_types2 = Dict{String, Union}(
  name => coltype
  for (coltype, names) in (
    (IntOrMiss, ("beer_id", "style_id")),
    (StrOrMiss, ("name", "url", "style", "sugar_scale", "brew_method",
                 "prime_method", "prime_am")),
    (FltOrMiss, ("size", "og", "fg", "abv", "ibu", "color", "boil_size",
                 "boil_time", "biol_grav", "efficiency", "mash_thick",
                 "pitch_rate", "pri_temp")),
  )
  for name in names
)

## Read the raw recipe data from the current working directory.
## - the file's own first line is skipped (`skipto = 2`); the column names are
##   supplied by `recipe_header` instead
## - the string "N/A" is read as `missing`
## - column element types come from `recipe_types2`
## `skipto` is used rather than `datarow`, which current CSV.jl releases no
## longer accept.
df_recipe_raw = CSV.read("recipeData.csv", DataFrame;
  delim = ',',
  quotechar = '"',
  missingstring = "N/A",
  skipto = 2,
  header = recipe_header,
  types = recipe_types2
)
## make sure every column accepts missing values
allowmissing!(df_recipe_raw)

 
## Drop the columns that are not needed (select! with Not replaces the old
## delete! call)
select!(
  df_recipe_raw,
  Not([:prime_method, :prime_am, :url]),
)
nrows, ncols = nrow(df_recipe_raw), ncol(df_recipe_raw)

## Persist the raw DataFrame
JLD2.@save "recipeRaw.jld2" df_recipe_raw

## Work on an independent copy from here on
df_recipe = deepcopy(df_recipe_raw)

## Keep only the rows whose style is present
filter!(:style => !ismissing, df_recipe)

nrows, ncols = nrow(df_recipe), ncol(df_recipe)
println("Row size: $nrows")


## Make beer categories from the style name:
##   0  -> the style mentions ale, IPA, porter or stout
##   1  -> otherwise, the style mentions lager, pilsner, bock or "okto"
##   99 -> anything else
## The patterns are case-insensitive regular expressions (the `i` flag), so a
## style is matched regardless of how it is capitalised; case-sensitive string
## literals such as "ALE" or "stout" would not match "Pale Ale" or "Stout".
val = map(df_recipe.style) do x
  if occursin(r"ale|ipa|porter|stout"i, x)
    0
  elseif occursin(r"lager|pilsner|bock|okto"i, x)
    1
  else
    99
  end
end

## Insert the category as a new last column of the DataFrame
insertcols!(df_recipe, :y => val)

## remove styles that are not lagers or ales
filter!(row -> row[:y] != 99, df_recipe)

## Drop the identifier and style columns, which are no longer needed
select!(
  df_recipe,
  Not([:beer_id, :name, :style, :style_id]),
)


## One-hot encode the categorical columns; only the first call is traced
for (column, traced) in (("brew_method", true), ("sugar_scale", false))
  onehot_encoding!(df_recipe, column; trace = traced)
end

## Summarise the first `ncols` columns
println(describe(df_recipe, cols = 1:ncols))


## The original categorical columns are redundant once encoded
println("Column size before delete : $(ncol(df_recipe))")
select!(df_recipe, Not([:brew_method, :sugar_scale]))
println("Column size after delete : $(ncol(df_recipe))")

JLD2.@save "recipe.jld2"  df_recipe`

I purchased Data Science with Julia, which looks quite useful for my purposes. On page xvi of the Preface it states that "all of the code for the book is available on Github", and references this URL, but the site still seems empty. Is there somewhere else I should look? Or will the code indeed be here soon? Thanks.

Working1-12.jl

Submitted by Bill Doak via email

Suggested fix:

  `## Initially the DataFrame has N rows and 3 columns
  df1 = DataFrame(
    x1 = rand(Normal(2, 1), N),
    x2 = [
      sample(["High", "Medium", "Low"], pweights([0.25, 0.45, 0.30]))
      for _ in 1:N
    ],
    x3 = rand(Pareto(2, 1), N),
  )

  nrows, ncols = size(df1)

  ## Add the response column, initialised to zero
  insertcols!(df1, :"y" => 0.0)

  ## y is x3 scaled by a factor that depends on the level of x2
  for row in eachrow(df1)
    factor = row.x2 == "High" ? 4 : (row.x2 == "Medium" ? 2 : 0.5)
    row.y = factor * row.x3
  end
  
  
  ## Copy of df1 with upper-case column names (df1 itself is left unchanged)
  ReName = rename(df1, [:x1, :x2, :x3, :y] .=> [:X1, :X2, :X3, :Y])
  println(first(ReName, 5))

  ## Remove rows that contain a missing value in any column.
  ## (A row object is never `missing` itself, so testing the row with
  ## `ismissing` would keep every row.)
  dropmissing!(df1)

  println(describe(df1))

  ## Group by the levels of x2
  gd = groupby(df1, :x2, sort=true)
  ## per-level sum of x1, plus the number of rows in each level
  println(combine(gd, :x1 => sum, nrow))
  ## per-level sum of x3, plus the number of rows in each level
  println(combine(gd, :x3 => sum, nrow))

  ## per-level median of x3, plus the number of rows in each level
  println(combine(gd, :x3 => median, nrow))

  ## summary stats for x3 over the whole DataFrame (not per level)
  println(summarystats(df1[!, "x3"]))

  println(describe(df1))

  ## several statistics of y within each level of x2
  stat = [length, mean, std, median, mad]
  gd = groupby(df1, :x2, sort=true)

  println(combine(gd, :y .=> stat))

  ## sort by x2 (descending), then by y (ascending) within each level
  S = sort(df1, [:x2, :y], rev = (true, false))
  println(first(S, 5))
  println(S[29:34, :])
  println(last(S,5))`

patait / dswj Goto Github PK

dswj's Introduction

dswj

dswj's People

Contributors

Stargazers

Watchers

Forkers

dswj's Issues

Chapter 3 functions.jl

Chapter 3 - data_class.jl

Code for the book?

Working1-12.jl

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent