datafest competition 2019
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

80 lines
2.7 KiB

  1. import pandas as pd
  2. def vectorize_mult(df, column, dictionary):
  3. """
  4. Changes all the categorical values into its respective
  5. number in the dictionary and then saves it in the DF
  6. :param df: dataframe
  7. :param column: column name
  8. :param dictionary: alterations to make
  9. """
  10. newCol = column + "Num"
  11. df[newCol] = df[column].map(dictionary)
  12. class WellnessCSV:
  13. def __init__(self):
  14. self.file = "data/wellness.csv"
  15. self.end = "cleaned/notnormalized_with_0NaN_wellness.csv"
  16. def vectorize(self):
  17. df = pd.read_csv(self.file)
  18. # Vectorizing appropriate data
  19. vectorize_mult(df, "Pain", {"No": 0, "Yes": 1})
  20. vectorize_mult(df, "Illness", {"No": 0, "Slightly Off": 0.5, "Yes": 1})
  21. vectorize_mult(df, "Menstruation", {"No": 0, "Yes": 1})
  22. vectorize_mult(df, "Nutrition", {"Poor": 0, "Okay": 0.5, "Excellent": 1})
  23. vectorize_mult(df, "NutritionAdjustment", {"No": 0, "Yes": 1})
  24. vectorize_mult(df, "USGMeasurement", {"No": 0, "Yes": 1})
  25. readiness = []
  26. for i, value in df["TrainingReadiness"].iteritems():
  27. value = value.split("%")[0]
  28. value = int(value) * (1/100)
  29. readiness.append(value)
  30. df["TrainingReadinessNum"] = readiness
  31. # Filling in NaNs for appropriate layers where they won't make a statistical difference
  32. df["MenstruationNum"] = df["MenstruationNum"].fillna(0)
  33. df["USGMeasurementNum"] = df["USGMeasurementNum"].fillna(0)
  34. df["NutritionNum"] = df["MenstruationNum"].fillna(0)
  35. df["NutritionAdjustmentNum"] = df["NutritionAdjustmentNum"].fillna(0)
  36. # Saving the df to the "cleaned" CSV
  37. df.to_csv(self.end)
  38. class FatigueSum:
  39. def __init__(self):
  40. self.file = "cleaned/time_series_normalized_wellness.csv"
  41. self.end = "cleaned/fatigue_total_sum.csv"
  42. def calculate(self):
  43. df = pd.read_csv(self.file)
  44. # get some of the fatigue for a particular date
  45. diction = dict()
  46. dates = df["TimeSinceAugFirst"].unique()
  47. dates = set(dates)
  48. dates = list(dates)
  49. # for each date, get unique data and get calculation
  50. for date in dates:
  51. pdf = df[df["TimeSinceAugFirst"] == date]
  52. num_players = len(pdf["playerID"].unique())
  53. fatigue_sum = pdf["normFatigue"].sum()
  54. result = fatigue_sum / num_players
  55. diction[date] = result
  56. # Converting
  57. dates = diction.keys()
  58. values = diction.values()
  59. final_df = pd.DataFrame()
  60. final_df["TimeSinceAugFirst"] = dates
  61. final_df["fatigueSum"] = values
  62. final_df.to_csv(self.end)