`using DataFrames, Query, CSV, JLD2, StatsBase, MLLabelUtils, Random, FileIO
include("chp3_functions.jl")
Random.seed!(24908)
# chp3_data_class.jl
## Types for the files columns
IntOrMiss = Union{Int64,Missing}
FltOrMiss = Union{Float64,Missing}
StrOrMiss = Union{String,Missing}
## define variable names for each column
recipe_header = ["beer_id", "name", "url", "style", "style_id", "size",
"og", "fg", "abv", "ibu", "color", "boil_size", "boil_time", "biol_grav",
"efficiency", "mash_thick", "sugar_scale", "brew_method", "pitch_rate",
"pri_temp", "prime_method", "prime_am"]
## dictionary of types for each column
recipe_types2 = Dict{String, Union}(
"beer_id" => IntOrMiss,
"name" => StrOrMiss,
"url" => StrOrMiss,
"style" => StrOrMiss,
"style_id" => IntOrMiss,
"size" => FltOrMiss,
"og" => FltOrMiss,
"fg" => FltOrMiss,
"abv" => FltOrMiss,
"ibu" => FltOrMiss,
"color" => FltOrMiss,
"boil_size" => FltOrMiss,
"boil_time" => FltOrMiss,
"biol_grav" => FltOrMiss,
"efficiency" => FltOrMiss,
"mash_thick" => FltOrMiss,
"sugar_scale" => StrOrMiss,
"brew_method" => StrOrMiss,
"pitch_rate" => FltOrMiss,
"pri_temp" => FltOrMiss,
"prime_method" => StrOrMiss,
"prime_am" => StrOrMiss
)
cd(pwd())
df_recipe_raw = CSV.read("recipeData.csv", DataFrame;
delim = ',' ,
quotechar = '"',
missingstring = "N/A",
datarow = 2,
header = recipe_header,
types = recipe_types2
)
allowmissing!(df_recipe_raw)
## delete! columns DOES NOT WORK
#delete!(df_recipe_raw, [:prime_method, :prime_am, :url])
select!(df_recipe_raw, Not([:prime_method, :prime_am, :url]))
nrows, ncols = size(df_recipe_raw)
## Write the raw data dataframe
JLD2.@save "recipeRaw.jld2" df_recipe_raw
## Create a copy of the DF
df_recipe = deepcopy(df_recipe_raw)
## exclude missing styles
filter!(row -> !ismissing(row[:style]), df_recipe)
nrows, ncols = size(df_recipe)
println("Row size: ",nrows)
#This DOES NOT WORK
#=
## Make beer categories
df_recipe[:y] = map(x ->
occursin(r"ALE"i, x) || occursin(r"IPA"i, x) || occursin(r"Porter"i, x)
|| occursin(r"stout"i, x) ? 0 :
occursin(r"lager"i, x) || occursin(r"pilsner"i, x) || occursin(r"bock"i, x)
|| occursin(r"okto"i, x) ? 1 : 99 ,
df_recipe[:style])
=#
occursin.("Porter", df_recipe.style)
val=(map(x ->
occursin("ALE", x) || occursin("IPA", x) || occursin("Porter", x)
|| occursin("stout", x) ? 0 :
occursin("lager", x) || occursin("pilsner", x) || occursin("bock", x)
|| occursin("okto", x) ? 1 : 99 ,
df_recipe.style))
## Insert new column into DataFrame
insertcols!(df_recipe,ncols+1, :"y" => val)
## remove styles that are not lagers or ales
filter!(row -> row[:y] != 99, df_recipe)
## remove extraneous columns
#delete!(df_recipe, [:beer_id, :name, :style, :style_id])
select!(df_recipe, Not([:beer_id, :name, :style, :style_id]))
## create dummy variables - one-hot-encoding
onehot_encoding!(df_recipe, "brew_method" , trace = true)
onehot_encoding!(df_recipe, "sugar_scale")
#describe(df_recipe, stats=[:eltype, :nmissing])
println(describe(df_recipe, cols=1:ncols))
println("Column size before delete : ",ncol(df_recipe))
#delete!(df_recipe, [:brew_method,:sugar_scale])
select!(df_recipe, Not([:brew_method,:sugar_scale]))
println("Column size after delete : ",ncol(df_recipe))
JLD2.@save "recipe.jld2" df_recipe`