// Multiple Linear Regression Subset Selection // Martin Sewell < martin@martinsewell.com> // 6 September 2021 // This program selects the subset of independent variables that best predict/explain the dependent variable. // Your data file must have no headers, but can be in comma, space or tab-separated format. // Each row represents a data point. // The first column must be the dependent variable(y), subsequent columns the independent variables(x1, x2, x3, etc.). // To run the program, if your data file is called "data.txt", and you want your results saved in "output.txt", ensure that ss.exe, gslcblasd.dll, gsld.dlland data.txt are all in the current directory, and from the command prompt type: // ss data.txt output.txt // The very last line of the output file gives you the multiple linear regression equation with the optimal subset of independent variables. #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include std::string Significance(long double p) { std::string significance = ""; if (p < 0.1) significance = "*"; if (p < 0.05) significance = "**"; if (p < 0.01) significance = "***"; if (p < 0.005) significance = "****"; if (p < 0.001) significance = "*****"; return significance; } int Subset(int x, int s) { int t = (int)pow(2, x-1); int t2 = (int)s / t; return t2 % 2; } int main(int argc, char* argv[]) { std::string inputfilename; inputfilename = argv[1]; // First column must be y, the dependent variable std::ifstream inputfile; inputfile.open(inputfilename); if (inputfile.fail()) { std::cout << "Failed to open " << inputfilename << std::endl; return 0; } std::string outputfilename; outputfilename = argv[2]; std::ofstream outputfile; outputfile.open(outputfilename); std::vector> datarc; std::vector::iterator itd; std::string str; long double d; std::istringstream data_s; long int row = 0; int col = 0; while (!inputfile.eof()) { while (getline(inputfile, str)) { if (!str.empty()) { std::replace(str.begin(), str.end(), ',', ' '); datarc.push_back(std::vector()); std::istringstream data_s(str); while (data_s >> d) { datarc[row].push_back(d); // datarc contains original data if (row == 0) col++; } row++; } } } std::vector rowlengths; for (int i = 0; i < row; i++) rowlengths.push_back(datarc[i].size()); int totaldata = std::accumulate(begin(rowlengths), end(rowlengths), 0, std::plus()); if (totaldata == 0) { std::cout << "No data found." << std::endl; return 0; } if (!(std::adjacent_find(rowlengths.begin(), rowlengths.end(), std::not_equal_to<>()) == rowlengths.end())) { std::cout << "Rows are not all the same length." << std::endl; return 0; } if (col == 1) { std::cout << "Only one column of data found." << std::endl; return 0; } long int numrows = row; int orignumcols = col; int NumOfModels = (int)pow(2, orignumcols - 1)-1; std::vector Priors(NumOfModels); std::vector AICs(NumOfModels); std::vector BICs(NumOfModels); std::vector adjustedBICs(NumOfModels); std::vector MallowsCps(NumOfModels); std::vector RSquares(NumOfModels); std::vector AdjustedRSquares(NumOfModels); std::vector SignificanceFs(NumOfModels); std::vector ResidualSSs(NumOfModels); std::vector< std::vector > subsets(NumOfModels, std::vector(orignumcols)); std::vector< std::vector > allcoefficients(NumOfModels, std::vector(orignumcols)); double sumPriors = 0; double minBIC = 0; for (int s = 1; s <= NumOfModels; s++) { outputfile << "model " << s << std::endl; std::vector< std::vector > datacr(orignumcols, std::vector(numrows)); for (int i = 0; i < orignumcols; ++i) subsets[s-1][i] = i; for (int i = 0; i < orignumcols; ++i) for (int j = 0; j < numrows; ++j) datacr[i][j] = datarc[j][i]; for (int x = orignumcols - 1; x > 0; x--) { if (Subset(x, s) == 0) { datacr.erase(datacr.begin() + x); subsets[s-1].erase(subsets[s-1].begin() + x); allcoefficients[s - 1].erase(allcoefficients[s - 1].begin() + x); } } int numcols = datacr.size(); int numparams = numcols; std::vector> datarc1(numrows, std::vector(numcols)); for (int i = 0; i < numcols; ++i) for (int j = 0; j < numrows; ++j) datarc1[j][i] = datacr[i][j]; long int N = numrows; // number of data points long int n = numrows; // number of data points int P = numcols; // number of independent variables = P-1 long int degreesoffreedom = numrows - numcols; int k = P + 1; // number of estimated parameters in the model outputfile << "number of columns: " << numcols << std::endl; outputfile << "number of rows: " << numrows << std::endl; outputfile << std::endl; gsl_vector* y; gsl_matrix* X; gsl_vector* c; // the coefficients c0, c1, c2, c3 gsl_matrix* cov; // allocate space for the matrices and vectors X = gsl_matrix_alloc(N, P); // this is an input y = gsl_vector_alloc(N); // this is an input c = gsl_vector_alloc(P); // this is an output cov = gsl_matrix_alloc(P, P); // this is an output // put the data into the X matrix, row by row for (int r = 0; r < N; ++r) { gsl_matrix_set(X, r, 0, 1.0); for (int c = 1; c < P; ++c) gsl_matrix_set(X, r, c, datarc1[r][c]); } // fill vector of observed data for (int r = 0; r < N; ++r) { gsl_vector_set(y, r, datacr[0][r]); } double chisq; // allocate temporary work space for gsl gsl_multifit_linear_workspace* work; work = gsl_multifit_linear_alloc(N, P); // fit the model gsl_multifit_linear(X, y, c, cov, &chisq, work); std::vector coefficients(numcols); std::vector standarderror(numcols); std::vector tstat(numcols); std::vector pvalue(numcols); std::vector Lower95(numcols); std::vector Upper95(numcols); gsl_vector_view stderror = gsl_matrix_diagonal(cov); gsl_vector* z = &stderror.vector; int Regressiondf = P - 1; int Residualdf = degreesoffreedom; long int Totaldf = N - 1; long double ResidualSS = chisq; // residual sum of squares = RSS = sum of squared residuals = SSR = sum of squared estimate of errors = SSE long double TotalSS = gsl_stats_tss(y->data, y->stride, y->size); long double RegressionSS = TotalSS - ResidualSS; long double ResidualMS = chisq / degreesoffreedom; // reduced chi-square statistic long double RegressionMS = RegressionSS / Regressiondf; // mean square for (int j = 0; j < numcols; j++) { coefficients[j] = gsl_vector_get(c, j); allcoefficients[s-1][j] = gsl_vector_get(c, j); standarderror[j] = sqrt(gsl_vector_get(z, j)); tstat[j] = coefficients[j] / standarderror[j]; pvalue[j] = 2 * (1 - gsl_cdf_tdist_P(abs(tstat[j]), degreesoffreedom)); Lower95[j] = coefficients[j] - abs(gsl_cdf_tdist_Pinv(0.025, Residualdf)) * standarderror[j]; Upper95[j] = coefficients[j] + abs(gsl_cdf_tdist_Pinv(0.025, Residualdf)) * standarderror[j]; } outputfile << std::setprecision(6) << std::fixed; outputfile << std::setw(14) << " " << std::setw(12) << std::right << "Intercept"; for (int j = 1; j < numcols; j++) { outputfile << std::setw(11) << "x" << subsets[s-1][j]; } outputfile << std::endl; outputfile << std::setw(14) << "Coefficients"; for (int j = 0; j < numcols; j++) { outputfile << std::setw(12) << coefficients[j]; } outputfile << std::endl; outputfile << std::setw(14) << "Standard error"; for (int j = 0; j < numcols; j++) { outputfile << std::setw(12) << standarderror[j]; } outputfile << std::endl; outputfile << std::setw(14) << "t stat"; for (int j = 0; j < numcols; j++) { outputfile << std::setw(12) << tstat[j]; } outputfile << std::endl; outputfile << std::setw(14) << "p-value"; for (int j = 0; j < numcols; j++) { outputfile << std::setw(12) << pvalue[j]; } outputfile << std::endl; outputfile << std::setw(14) << "Lower 95%"; for (int j = 0; j < numcols; j++) { outputfile << std::setw(12) << Lower95[j]; } outputfile << std::endl; outputfile << std::setw(14) << "Upper 95%"; for (int j = 0; j < numcols; j++) { outputfile << std::setw(12) << Upper95[j]; } outputfile << std::endl; outputfile << std::setw(14) << "Significance"; for (int j = 0; j < numcols; j++) { outputfile << std::setw(12) << Significance(pvalue[j]); } outputfile << std::endl; outputfile << std::endl; outputfile << "Statistical significance:" << std::endl; outputfile << "* 10%, ** 5%, *** 1%, **** 0.5%, ***** 0.1%" << std::endl; outputfile << std::endl; long double MultipleR = sqrt(1 - chisq / gsl_stats_tss(y->data, y->stride, y->size)); // coefficient of multiple correlation long double RSquare = 1 - chisq / gsl_stats_tss(y->data, y->stride, y->size); // coefficient of determination = R^2 = r^2 = R squared long double AdjustedRSquare = 1 - (ResidualSS / Residualdf) / (TotalSS / Totaldf); // Adjusted R^2 long double StandardError = sqrt(ResidualMS); // standard error = regression standard error = standard error of the regression = standard error of the equation = estimated standard deviation of errors long int Observations = N; // Number of data points outputfile << std::setprecision(12) << std::fixed; outputfile << "Multiple R " << MultipleR << std::endl; outputfile << "R Square " << RSquare << std::endl; outputfile << "Adjusted R Square " << AdjustedRSquare << std::endl; outputfile << "Standard Error " << StandardError << std::endl; outputfile << "Observations " << Observations << std::endl; outputfile << std::endl; //ANOVA outputfile << "Regression df " << Regressiondf << std::endl; outputfile << "Residual df " << Residualdf << std::endl; outputfile << "Total df " << Totaldf << std::endl; outputfile << std::endl; outputfile << "Regression SS " << RegressionSS << std::endl; outputfile << "Residual SS " << ResidualSS << std::endl; outputfile << "Total SS " << TotalSS << std::endl; outputfile << std::endl; outputfile << "Regression MS " << RegressionMS << std::endl; outputfile << "Residual MS " << ResidualMS << std::endl; outputfile << std::endl; long double RegressionF = (RegressionSS / Regressiondf) / (ResidualSS / degreesoffreedom); outputfile << "Regression F " << RegressionF << std::endl; outputfile << std::endl; long double SignificanceF = 1 - gsl_cdf_fdist_P(abs(RegressionF), Regressiondf, degreesoffreedom); outputfile << "Significance F " << SignificanceF << std::endl; outputfile << "Significance " << std::setw(12) << Significance(SignificanceF); outputfile << std::endl; outputfile << "____________________________________________________________________" << std::endl; long double MSE = ResidualSS / degreesoffreedom; // mean square error = residual mean square long double AIC = 2 * (double)k + (double)n * log(ResidualSS); // long double BIC = (double)n * log(ResidualSS / (double)n) + ((double)k) * log((double)n); // long double MallowsCp = ResidualSS / MSE - (double)n + (2.0) * ((double)P); // long double Prior = pow(5, numparams); Priors[s - 1] = Prior; sumPriors = sumPriors + Prior; if (BIC < minBIC) minBIC = BIC; ResidualSSs[s - 1] = ResidualSS; RSquares[s - 1] = RSquare; AdjustedRSquares[s - 1] = AdjustedRSquare; SignificanceFs[s - 1] = SignificanceF; AICs[s - 1] = AIC; BICs[s - 1] = BIC; MallowsCps[s - 1] = MallowsCp; outputfile << std::endl; outputfile << std::flush; gsl_matrix_free(X); gsl_matrix_free(cov); gsl_vector_free(y); gsl_vector_free(c); gsl_multifit_linear_free(work); // mvs } for (int i = 0; i < NumOfModels; ++i) { adjustedBICs[i] = BICs[i] - minBIC; } std::vector Likelihoods(NumOfModels); std::vector Posteriors(NumOfModels); long double sumLikelihoods = 0; for (int i = 0; i < NumOfModels; ++i) { Likelihoods[i] = exp(-0.5 * adjustedBICs[i]); sumLikelihoods = sumLikelihoods + Likelihoods[i]; } for (int i = 0; i < NumOfModels; ++i) { Priors[i] = Priors[i] / sumPriors; Likelihoods[i] = Likelihoods[i] / sumLikelihoods; } long double sumPosteriors = 0; for (int i = 0; i < NumOfModels; ++i) { Posteriors[i] = Priors[i] * Likelihoods[i]; sumPosteriors = sumPosteriors + Posteriors[i]; } for (int i = 0; i < NumOfModels; ++i) { Posteriors[i] = Posteriors[i]/ sumPosteriors; } outputfile << "Model x ResidualSS RSquare AdjdRSquare SignificanceF Prior Likelihood Posterior AIC BIC MallowsCp" << std::endl; for (int i = 0; i < NumOfModels; ++i) { outputfile << "Model " << i + 1 << " "; for (unsigned int j = 1; j < subsets[i].size(); j++) outputfile << "x" << subsets[i][j] << " "; outputfile << ResidualSSs[i] << " " << RSquares[i] << " " << AdjustedRSquares[i] << " " << SignificanceFs[i] << " " << Priors[i] << " " << Likelihoods[i] << " " << Posteriors[i] << " " << AICs[i] << " " << BICs[i] << " " << MallowsCps[i] << std::endl; } outputfile << std::endl; long double lowestAIC; int bestmodel = -1; for (int i = 0; i < NumOfModels; ++i) { if (i == 0) { lowestAIC = AICs[i]; bestmodel = i + 1; } else { if (AICs[i] < lowestAIC) { lowestAIC = AICs[i]; bestmodel = i + 1; } } } outputfile << "Best model (AIC): " << bestmodel << std::endl; outputfile << std::endl; outputfile << "y = " << allcoefficients[bestmodel - 1][0]; for (unsigned int j = 1; j < subsets[bestmodel - 1].size(); j++) outputfile << " + " << allcoefficients[bestmodel - 1][j] << "x" << subsets[bestmodel - 1][j]; outputfile << std::endl; std::cout << "Finished!" << std::endl; return 0; }