datafest competition 2019
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

43 lines
1.4 KiB

  1. from sklearn import linear_model
  2. import pandas as pd
  3. from sklearn.metrics import mean_squared_error, r2_score
  4. def k_days_into_future_regression(X, y, k, n0):
  5. """
  6. linear regression that returns the fitted weights as well as metrics
  7. :param X: x timeseries dataframe (very clean, no unamed columns), multidimensional rows
  8. :param y: y timeseries dataframe (very clean, no unamed columns), scalar rows
  9. :param k: days predicting in advance
  10. :param n0: ignoring the first n0 days
  11. :return: intercept, slopes, correlation, mean squared error
  12. """
  13. col = "TimeSinceAugFirst"
  14. inp = []
  15. out = []
  16. for day in y[col][n0 - 1:]:
  17. prev = day - k
  18. xprev = X[X[col] == prev].drop(columns=[col]).to_numpy()
  19. if xprev.shape[0] != 1:
  20. continue
  21. else:
  22. xprev = xprev[0, :]
  23. yt = y[y[col] == day].drop(columns=[col]).to_numpy()[0, :]
  24. inp.append(xprev)
  25. out.append(yt)
  26. regr = linear_model.LinearRegression()
  27. regr.fit(inp, out)
  28. predictions = regr.predict(inp)
  29. mse = mean_squared_error(out, predictions)/(len(out) - 2)
  30. r2 = r2_score(out, predictions)
  31. return regr.intercept_, regr.coef_, r2, mse
  32. def main():
  33. fatigueSums = pd.read_csv("fatigue_total_sum.csv")
  34. performance = pd.read_csv("../data_preparation/cleaned/expSmoothWorkAndFatigueData.csv", index_col=0).drop(columns=["totalWork", "averageWorkLoad", "smoothedFatigueData"])
  35. print(k_days_into_future_regression(fatigueSums, performance, 0, 1))
  36. if __name__ == "__main__":
  37. main()