import pandas as pd
|
|
|
|
|
|
def vectorize_mult(df, column, dictionary):
    """
    Encode a categorical column as numbers, in place.

    Every value of ``df[column]`` is looked up in ``dictionary`` and the
    result is stored in a new column named ``<column>Num``. The source
    column is left as it is and nothing is returned.

    :param df: dataframe to modify
    :param column: name of the categorical column to encode
    :param dictionary: mapping from category value to its number
    """
    target_name = column + "Num"
    source_values = df[column]
    encoded = source_values.map(dictionary)
    df[target_name] = encoded
|
|
|
|
|
|
class WellnessCSV:
    """
    Turns the raw wellness CSV into a numeric copy.

    Categorical answers are encoded into new ``<column>Num`` columns, the
    percentage strings in ``TrainingReadiness`` become fractions, and
    selected missing values are replaced with 0 before the frame is saved.
    """

    def __init__(self):
        # Source CSV and destination of the cleaned copy (relative paths).
        self.file = "data/wellness.csv"
        self.end = "cleaned/notnormalized_with_0NaN_wellness.csv"

    def vectorize(self):
        """
        Read ``self.file``, add the numeric columns and write ``self.end``.

        The original columns are kept; every derived column is added next
        to them with a ``Num`` suffix. Nothing is returned.
        """
        df = pd.read_csv(self.file)

        # Vectorizing appropriate data
        vectorize_mult(df, "Pain", {"No": 0, "Yes": 1})
        vectorize_mult(df, "Illness", {"No": 0, "Slightly Off": 0.5, "Yes": 1})
        vectorize_mult(df, "Menstruation", {"No": 0, "Yes": 1})
        vectorize_mult(df, "Nutrition", {"Poor": 0, "Okay": 0.5, "Excellent": 1})
        vectorize_mult(df, "NutritionAdjustment", {"No": 0, "Yes": 1})
        vectorize_mult(df, "USGMeasurement", {"No": 0, "Yes": 1})

        # "85%" -> 0.85. Done with the vectorized string accessor instead of
        # Series.iteritems() (removed in pandas 2.0); a missing entry stays
        # NaN instead of raising on ``.split``.
        percent_text = df["TrainingReadiness"].str.split("%").str[0]
        df["TrainingReadinessNum"] = pd.to_numeric(percent_text) * (1 / 100)

        # Filling in NaNs for appropriate layers where they won't make a
        # statistical difference. Each column is filled from itself, so the
        # mapped Nutrition scores are no longer overwritten by Menstruation.
        zero_filled = (
            "MenstruationNum",
            "USGMeasurementNum",
            "NutritionNum",
            "NutritionAdjustmentNum",
        )
        for name in zero_filled:
            df[name] = df[name].fillna(0)

        # Saving the df to the "cleaned" CSV
        df.to_csv(self.end)
|
|
|
|
|
|
class FatigueSum:
    """
    Aggregates normalized fatigue per date into a small summary CSV.

    For every distinct ``TimeSinceAugFirst`` value the ``normFatigue``
    values are summed and divided by the number of distinct ``playerID``
    values seen on that date.
    """

    def __init__(self):
        # Source CSV and destination of the per-date summary (relative paths).
        self.file = "cleaned/time_series_normalized_wellness.csv"
        self.end = "cleaned/fatigue_total_sum.csv"

    def calculate(self):
        """
        Read ``self.file``, compute the per-date value and write ``self.end``.

        The output has one row per date, ordered by ascending
        ``TimeSinceAugFirst``, with the result in a ``fatigueSum`` column.
        Rows whose ``TimeSinceAugFirst`` is missing are left out. Nothing is
        returned.
        """
        df = pd.read_csv(self.file)

        # Get the sum of the fatigue for each date in a single pass; groupby
        # sorts its keys, so the row order is reproducible.
        per_date = df.groupby("TimeSinceAugFirst")
        fatigue_total = per_date["normFatigue"].sum()

        # dropna=False counts a missing playerID as one player, exactly as
        # len(Series.unique()) does.
        player_count = per_date["playerID"].nunique(dropna=False)

        per_player = fatigue_total / player_count

        # Converting the date-indexed series into the two-column output frame
        final_df = per_player.rename("fatigueSum").reset_index()
        final_df.to_csv(self.end)
|
|
|