|
@ -1,12 +1,51 @@ |
|
|
import pandas as pd |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
# read in CSV |
|
|
|
|
|
df = pd.read_csv('cleaned/dirty_wellness_na.csv') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def vectorize_mult(column, dictionary, file=None): |
|
|
|
|
|
|
|
|
def vectorize_mult(df, column, dictionary): |
|
|
|
|
|
""" |
|
|
|
|
|
Changes all the categorical values into its respective |
|
|
|
|
|
number in the dictionary and then saves it in the DF |
|
|
|
|
|
:param df: dataframe |
|
|
|
|
|
:param column: column name |
|
|
|
|
|
:param dictionary: alterations to make |
|
|
|
|
|
""" |
|
|
newCol = column + "Num" |
|
|
newCol = column + "Num" |
|
|
df[newCol] = df[column].map(dictionary) |
|
|
df[newCol] = df[column].map(dictionary) |
|
|
if file is not None: |
|
|
|
|
|
df.to_csv('cleaned/{}.csv'.format(file)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WellnessCSV: |
|
|
|
|
|
def __init__(self): |
|
|
|
|
|
self.file = "data/wellness.csv" |
|
|
|
|
|
self.end = "cleaned/notnormalized_with_0NaN_wellness.csv" |
|
|
|
|
|
|
|
|
|
|
|
def vectorize(self): |
|
|
|
|
|
df = pd.read_csv(self.file) |
|
|
|
|
|
|
|
|
|
|
|
# Vectorizing appropriate data |
|
|
|
|
|
vectorize_mult(df, "Pain", {"No": 0, "Yes": 1}) |
|
|
|
|
|
vectorize_mult(df, "Illness", {"No": 0, "Slightly Off": 0.5, "Yes": 1}) |
|
|
|
|
|
vectorize_mult(df, "Menstruation", {"No": 0, "Yes": 1}) |
|
|
|
|
|
vectorize_mult(df, "Nutrition", {"Poor": 0, "Okay": 0.5, "Excellent": 1}) |
|
|
|
|
|
vectorize_mult(df, "NutritionAdjustment", {"No": 0, "Yes": 1}) |
|
|
|
|
|
vectorize_mult(df, "USGMeasurement", {"No": 0, "Yes": 1}) |
|
|
|
|
|
|
|
|
|
|
|
readiness = [] |
|
|
|
|
|
for i, value in df["TrainingReadiness"].iteritems(): |
|
|
|
|
|
value = value.split("%")[0] |
|
|
|
|
|
value = int(value) * (1/100) |
|
|
|
|
|
readiness.append(value) |
|
|
|
|
|
|
|
|
|
|
|
df["TrainingReadinessNum"] = readiness |
|
|
|
|
|
|
|
|
|
|
|
# Filling in NaNs for appropriate layers where they won't make a statistical difference |
|
|
|
|
|
df["MenstruationNum"] = df["MenstruationNum"].fillna(0) |
|
|
|
|
|
df["USGMeasurementNum"] = df["USGMeasurementNum"].fillna(0) |
|
|
|
|
|
df["NutritionNum"] = df["MenstruationNum"].fillna(0) |
|
|
|
|
|
df["NutritionAdjustmentNum"] = df["NutritionAdjustmentNum"].fillna(0) |
|
|
|
|
|
|
|
|
|
|
|
# Saving the df to the "cleaned" CSV |
|
|
|
|
|
df.to_csv(self.end) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cls = WellnessCSV() |
|
|
|
|
|
cls.vectorize() |