datafest competition 2019
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

76 lines
2.5 KiB

  1. from sklearn import linear_model
  2. import pandas as pd
  3. from sklearn.metrics import mean_squared_error, r2_score
  4. from matplotlib import pyplot as plt
  5. import numpy as np
  6. def k_days_into_future_regression(X, y, k, n0):
  7. """
  8. linear regression that returns the fitted weights as well as metrics
  9. :param X: x timeseries dataframe (very clean, no unamed columns), multidimensional rows
  10. :param y: y timeseries dataframe (very clean, no unamed columns), scalar rows
  11. :param k: days predicting in advance
  12. :param n0: ignoring the first n0 days
  13. :return: intercept, slopes, correlation, mean squared error
  14. """
  15. col = "TimeSinceAugFirst"
  16. inp = []
  17. out = []
  18. for day in y[col][n0 - 1:]:
  19. prev = day - k
  20. xprev = X[X[col] == prev].drop(columns=[col]).to_numpy()
  21. if xprev.shape[0] != 1:
  22. continue
  23. else:
  24. xprev = xprev[0, :]
  25. yt = y[y[col] == day].drop(columns=[col]).to_numpy()[0, :]
  26. inp.append(xprev)
  27. out.append(yt)
  28. regr = linear_model.LinearRegression()
  29. regr.fit(inp, out)
  30. predictions = regr.predict(inp)
  31. mse = mean_squared_error(out, predictions)/(len(out) - 2)
  32. r2 = r2_score(out, predictions)
  33. return regr.intercept_, regr.coef_, r2, mse
  34. def standard_lr(x, y):
  35. x = x.reshape(-1, 1)
  36. y = y.reshape(-1, 1)
  37. regr = linear_model.LinearRegression()
  38. regr.fit(x, y)
  39. predictions = regr.predict(x)
  40. mse = mean_squared_error(y, predictions) / (len(y) - 2)
  41. r2 = r2_score(y, predictions)
  42. return regr.intercept_, regr.coef_, r2, mse
  43. def run_all_linears():
  44. # Reads in the neccessary csv file
  45. df = pd.read_csv('data_preparation/cleaned/time_series_normalized_wellness_menstruation.csv')
  46. regr = linear_model.LinearRegression()
  47. for i in range(4, 11):
  48. for j in range(1, 11 - i):
  49. mat = df[[df.columns[i], df.columns[i + j]]].values
  50. regr.intercept_, regr.coef_, r2, mse = standard_lr(mat[:, 0], mat[:, 1])
  51. plt.figure(figsize=(6, 6))
  52. plt.xlabel(df.columns[i])
  53. plt.ylabel(df.columns[i + j])
  54. plt.title('r2: ' + str(r2))
  55. plt.scatter(mat[:, 0], mat[:, 1])
  56. plt.savefig('wellness_linear_regressions/' + df.columns[i] + '_vs_' + df.columns[i + j] + '.png')
  57. plt.close()
  58. def run_all_polynomials():
  59. # Reads in the neccessary csv file
  60. df = pd.read_csv('data_preparation/cleaned/time_series_normalized_wellness_menstruation.csv')
  61. regr = linear_model.LinearRegression()
  62. for i in range(4, 11):
  63. for j in range(1, 11 - i):
  64. mat = df[[df.columns[i], df.columns[i + j]]].values
  65. run_all_linears()