| @ -0,0 +1,76 @@ | |||
| from sklearn import linear_model | |||
| import pandas as pd | |||
| from sklearn.metrics import mean_squared_error, r2_score | |||
| from matplotlib import pyplot as plt | |||
| import numpy as np | |||
| def k_days_into_future_regression(X, y, k, n0): | |||
| """ | |||
| linear regression that returns the fitted weights as well as metrics | |||
| :param X: x timeseries dataframe (very clean, no unamed columns), multidimensional rows | |||
| :param y: y timeseries dataframe (very clean, no unamed columns), scalar rows | |||
| :param k: days predicting in advance | |||
| :param n0: ignoring the first n0 days | |||
| :return: intercept, slopes, correlation, mean squared error | |||
| """ | |||
| col = "TimeSinceAugFirst" | |||
| inp = [] | |||
| out = [] | |||
| for day in y[col][n0 - 1:]: | |||
| prev = day - k | |||
| xprev = X[X[col] == prev].drop(columns=[col]).to_numpy() | |||
| if xprev.shape[0] != 1: | |||
| continue | |||
| else: | |||
| xprev = xprev[0, :] | |||
| yt = y[y[col] == day].drop(columns=[col]).to_numpy()[0, :] | |||
| inp.append(xprev) | |||
| out.append(yt) | |||
| regr = linear_model.LinearRegression() | |||
| regr.fit(inp, out) | |||
| predictions = regr.predict(inp) | |||
| mse = mean_squared_error(out, predictions)/(len(out) - 2) | |||
| r2 = r2_score(out, predictions) | |||
| return regr.intercept_, regr.coef_, r2, mse | |||
| def standard_lr(x, y): | |||
| x = x.reshape(-1, 1) | |||
| y = y.reshape(-1, 1) | |||
| regr = linear_model.LinearRegression() | |||
| regr.fit(x, y) | |||
| predictions = regr.predict(x) | |||
| mse = mean_squared_error(y, predictions) / (len(y) - 2) | |||
| r2 = r2_score(y, predictions) | |||
| return regr.intercept_, regr.coef_, r2, mse | |||
| def run_all_linears(): | |||
| # Reads in the neccessary csv file | |||
| df = pd.read_csv('data_preparation/cleaned/time_series_normalized_wellness_menstruation.csv') | |||
| regr = linear_model.LinearRegression() | |||
| for i in range(4, 11): | |||
| for j in range(1, 11 - i): | |||
| mat = df[[df.columns[i], df.columns[i + j]]].values | |||
| regr.intercept_, regr.coef_, r2, mse = standard_lr(mat[:, 0], mat[:, 1]) | |||
| plt.figure(figsize=(6, 6)) | |||
| plt.xlabel(df.columns[i]) | |||
| plt.ylabel(df.columns[i + j]) | |||
| plt.title('r2: ' + str(r2)) | |||
| plt.scatter(mat[:, 0], mat[:, 1]) | |||
| plt.savefig('wellness_linear_regressions/' + df.columns[i] + '_vs_' + df.columns[i + j] + '.png') | |||
| plt.close() | |||
| def run_all_polynomials(): | |||
| # Reads in the neccessary csv file | |||
| df = pd.read_csv('data_preparation/cleaned/time_series_normalized_wellness_menstruation.csv') | |||
| regr = linear_model.LinearRegression() | |||
| for i in range(4, 11): | |||
| for j in range(1, 11 - i): | |||
| mat = df[[df.columns[i], df.columns[i + j]]].values | |||
| run_all_linears() | |||