datafest competition 2019
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

40 lines
1.4 KiB

  1. from sklearn import linear_model
  2. import pandas as pd
  3. from sklearn.metrics import mean_squared_error, r2_score
  def k_days_into_future_regression(X, y, k, n0):
      """
      Fit a linear regression that predicts y on a given day from X k days earlier.

      For every day taken from y (starting at row position ``n0 - 1``), the row
      of X whose "TimeSinceAugFirst" equals ``day - k`` is paired with the row of
      y whose "TimeSinceAugFirst" equals ``day``. The model is fitted on those
      pairs and evaluated on the same (training) data.

      :param X: x timeseries dataframe (very clean, no unnamed columns) with a
          "TimeSinceAugFirst" column; the remaining columns are the features
      :param y: y timeseries dataframe (very clean, no unnamed columns) with a
          "TimeSinceAugFirst" column; the remaining columns are the targets
      :param k: days predicting in advance
      :param n0: rows of y before position ``n0 - 1`` are skipped; values
          below 1 mean "use every row"
      :return: intercept, slopes, R^2 score, mean squared error (the last two
          are computed on the training data)
      :raises IndexError: if a required day is missing from X or y
      """
      col = "TimeSinceAugFirst"

      # Loop-invariant pieces: the day columns and the frames without them.
      x_days = X[col]
      y_days = y[col]
      x_features = X.drop(columns=[col])
      y_targets = y.drop(columns=[col])

      # Clamp the start so n0 <= 0 does not turn into a negative slice, which
      # would silently keep only the last row(s) of y.
      start = max(n0 - 1, 0)

      inp = []
      out = []
      for day in y_days.iloc[start:]:
          prev = day - k
          # First matching row only; [0, :] raises IndexError when no row matches.
          inp.append(x_features[x_days == prev].to_numpy()[0, :])
          out.append(y_targets[y_days == day].to_numpy()[0, :])

      regr = linear_model.LinearRegression()
      regr.fit(inp, out)
      predictions = regr.predict(inp)

      # mean_squared_error already averages over the samples; dividing it again
      # by (len(out) - 2) did not yield an MSE and divided by zero for two samples.
      mse = mean_squared_error(out, predictions)
      r2 = r2_score(out, predictions)
      return regr.intercept_, regr.coef_, r2, mse
  def main():
      """Load the fatigue and 21-day work-average CSVs and print the regression results."""
      fatigue_sums = pd.read_csv("fatigue_total_sum.csv")
      work_moving_average_21 = pd.read_csv("21DaySlidingWorkAverage.csv", index_col=0)
      # k=0 pairs each day's work average with the same day's fatigue sum.
      # n0=21 presumably matches the 21-day averaging window -- confirm.
      print(k_days_into_future_regression(work_moving_average_21, fatigue_sums, 0, 21))
  # Run the analysis only when this file is executed as a script, not on import.
  if __name__ == "__main__":
      main()