datafest competition 2019


from sklearn import linear_model
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib import pyplot as plt
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
def k_days_into_future_regression(X, y, k, n0):
    """
    Linear regression that predicts each y value from the X row recorded k days
    earlier, returning the fitted weights as well as fit metrics.

    :param X: X timeseries dataframe (very clean, no unnamed columns), multidimensional rows
    :param y: y timeseries dataframe (very clean, no unnamed columns), scalar rows
    :param k: number of days predicted in advance
    :param n0: ignore the first n0 days
    :return: intercept, slopes, r2 score, mean squared error
    """
    col = "TimeSinceAugFirst"
    inp = []
    out = []
    # Pair each target day with the feature row observed k days earlier.
    for day in y[col][n0 - 1:]:
        prev = day - k
        xprev = X[X[col] == prev].drop(columns=[col]).to_numpy()
        # Skip days with no (or an ambiguous) feature row k days back.
        if xprev.shape[0] != 1:
            continue
        xprev = xprev[0, :]
        yt = y[y[col] == day].drop(columns=[col]).to_numpy()[0, :]
        inp.append(xprev)
        out.append(yt)
    regr = linear_model.LinearRegression()
    regr.fit(inp, out)
    predictions = regr.predict(inp)
    # Residual variance with n - 2 degrees of freedom rather than the plain MSE.
    mse = mean_squared_error(out, predictions) / (len(out) - 2)
    r2 = r2_score(out, predictions)
    return regr.intercept_, regr.coef_, r2, mse
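
# Usage sketch (not from the original repo): `features` and `target` are hypothetical
# dataframes that each carry a "TimeSinceAugFirst" column, e.g. daily weather features
# and a daily attendance count. Predict 3 days ahead, ignoring the first 7 days:
#
#   intercept, slopes, r2, mse = k_days_into_future_regression(features, target, k=3, n0=7)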
def standard_lr(x, y):
    """Ordinary least-squares fit of 1-D y on 1-D x; returns intercept, slope, r2 and mse."""
    # Reshape the 1-D arrays into column vectors as scikit-learn expects.
    x = x.reshape(-1, 1)
    y = y.reshape(-1, 1)
    regr = linear_model.LinearRegression()
    regr.fit(x, y)
    predictions = regr.predict(x)
    mse = mean_squared_error(y, predictions) / (len(y) - 2)
    r2 = r2_score(y, predictions)
    return regr.intercept_, regr.coef_, r2, mse
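
# Usage sketch (illustrative only): fit y = a + b*x on two 1-D numpy arrays.
# The data below is synthetic, not anything from the competition dataset.
#
#   xs = np.arange(30, dtype=float)
#   ys = 2.0 * xs + 1.0 + np.random.normal(scale=0.5, size=30)
#   intercept, slope, r2, mse = standard_lr(xs, ys)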
def poly_regression(x, y, degree):
    """Polynomial regression of the given degree; returns rmse and r2."""
    # Reshape the 1-D inputs into column vectors so scikit-learn can fit them.
    x = x.reshape(-1, 1)
    y = y.reshape(-1, 1)
    # Expand x into polynomial features of the requested degree, then fit a linear model.
    polynomial_features = PolynomialFeatures(degree=degree)
    x_poly = polynomial_features.fit_transform(x)
    model = linear_model.LinearRegression()
    model.fit(x_poly, y)
    y_poly_pred = model.predict(x_poly)
    rmse = np.sqrt(mean_squared_error(y, y_poly_pred))
    r2 = r2_score(y, y_poly_pred)
    return rmse, r2
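
# Usage sketch (illustrative only): compare fit quality across degrees on the same
# hypothetical arrays `xs`, `ys` used above; lower rmse / higher r2 means a better fit.
#
#   for d in range(1, 6):
#       rmse, r2 = poly_regression(xs, ys, d)
#       print(d, rmse, r2)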
def run_all_polynomials():
    """Fit polynomials of degree 1-5 to every pair of columns and save scatter plots."""
    # Read in the necessary csv file.
    df = pd.read_csv('data_preparation/cleaned/personal.csv')
    print("xVal, yVal, degree, r2, rmse")
    for i in range(3, 14):
        for j in range(1, 14 - i):
            mat = df[[df.columns[i], df.columns[i + j]]].values
            for d in range(1, 6):
                rmse, r2 = poly_regression(mat[:, 0], mat[:, 1], d)
                plt.figure(figsize=(6, 6))
                plt.xlabel(df.columns[i])
                plt.ylabel(df.columns[i + j])
                plt.title('r2: ' + str(r2) + ', degree: ' + str(d))
                plt.scatter(mat[:, 0], mat[:, 1])
                plt.savefig('personal_regression_info/' + df.columns[i] + '_vs_' + df.columns[i + j] + '_' + str(d) + '_degree.png')
                print(df.columns[i] + ', ' + df.columns[i + j] + ', ' + str(d) + ', ' + str(r2) + ', ' + str(rmse))
                plt.close()
# run_all_linears()
run_all_polynomials()