From 8f263e859e0970ce87b77addc80dec28e8fc7e82 Mon Sep 17 00:00:00 2001 From: wukong Date: Tue, 5 Jun 2018 22:34:51 -0700 Subject: re-init --- lin_reg2.awk | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 lin_reg2.awk (limited to 'lin_reg2.awk') diff --git a/lin_reg2.awk b/lin_reg2.awk new file mode 100644 index 0000000..a66c78f --- /dev/null +++ b/lin_reg2.awk @@ -0,0 +1,101 @@ +### lin_reg.awk +# simple linear regression between columns + +BEGIN { + sign = "[+-]?" + decimal = "[0-9]+[.]?[0-9]*" + fraction = "[.][0-9]*" + exponent = "([Ee]" sign "[0-9]+)?" + number = "^" sign "(" decimal "|" fraction ")" exponent "$" +} + +NR == 1 { + header_nf = NF + for (n=1; n<=NF; n++) { + if ($n !~ number) + header[n] = $n + else + header[n] = n + } +} + +NF != 0 { + #printf("\n%s: %s", NR, $0) + if (NF > max_nf) + max_nf = NF + + ### iterate over columns + for (y=1; y<= NF; y++) { + if ($y ~ number) { + + ### mean + count[y] += 1 + sum[y] += $y + sum2[y] += $y*$y + mean[y] = sum[y]/count[y] + + ### difference from the mean + delta[y] = $y - mean[y] + sum_delta[y] += delta[y] + sum_delta2[y] += delta[y]*delta[y] + + ### sample variance + if (count[y] - 1) + var[y] = sum_delta2[y]/(count[y] - 1) + else + var[y] = 0 + + # x = row, y = col + for (x=1; x<=max_nf; x++) { + count[x,y] += 1 + sum_xy[x,y] += $x*$y + sum_delta_xy[x,y] += delta[x]*delta[y] + + # correlation + r_den[x,y] = sqrt(sum_delta2[x]*sum_delta2[y]) + if (r_den[x,y]) + r[x,y] = sum_delta_xy[x,y]/r_den[x,y] + else + r[x,y] = 1 + + ab_den[x,y] = (count[x,y]*sum2[x] - sum[x]*sum[x]) + if (ab_den[x,y]) { + a[x,y] = (sum[y]*sum2[x] - sum[x]*sum_xy[x,y])/ab_den[x,y] + b[x,y] = (count[x,y]*sum_xy[x,y] - sum[x]*sum[y])/ab_den[x,y] + } + else { + a[x,y] = 0 + b[x,y] = 1 + } + + ### error estimate + err_den[x,y] = count[x,y]*(count[x,y] - 2) + if (count[x,y] > 2) { + err[x,y] = $y - (a[x,y] + b[x,y]*$x) + sum_err2[x,y] += err[x,y]*err[x,y] + } + b_err_den[x,y] = (count[x,y] - 2)*sum_delta2[x] + if (b_err_den[x,y]) + b_err[x,y] = sqrt(sum_err2[x,y]/b_err_den[x,y]) + a_err_den[x,y] = count[x,y]*b_err_den[x,y] + if (a_err_den[x,y]) + a_err[x,y] = sqrt(sum2[x]/count[x,y])*b_err[x,y] + } + } + else + continue + } +} + +END { + for (y=1; y<=max_nf; y++) { + for (x=1; x<=max_nf; x++) { + if (x != y && r[x,y]) { + printf("\n %g \t (%s) \t = (%g +/- %g)(%s) \t + (%g +/- %g)", + 10.0*log(r[x,y]*r[x,y])/log(10), header[y], b[x,y], b_err[x,y], header[x], + a[x,y], a_err[x,y]) + } + } + } +} + -- cgit v1.2.3