summaryrefslogtreecommitdiff
path: root/lin_reg2.awk
diff options
context:
space:
mode:
Diffstat (limited to 'lin_reg2.awk')
-rw-r--r--lin_reg2.awk101
1 files changed, 101 insertions, 0 deletions
diff --git a/lin_reg2.awk b/lin_reg2.awk
new file mode 100644
index 0000000..a66c78f
--- /dev/null
+++ b/lin_reg2.awk
@@ -0,0 +1,101 @@
+### lin_reg.awk
+# simple linear regression between columns
+
+BEGIN {
+ sign = "[+-]?"
+ decimal = "[0-9]+[.]?[0-9]*"
+ fraction = "[.][0-9]*"
+ exponent = "([Ee]" sign "[0-9]+)?"
+ number = "^" sign "(" decimal "|" fraction ")" exponent "$"
+}
+
+NR == 1 {
+ header_nf = NF
+ for (n=1; n<=NF; n++) {
+ if ($n !~ number)
+ header[n] = $n
+ else
+ header[n] = n
+ }
+}
+
+NF != 0 {
+ #printf("\n%s: %s", NR, $0)
+ if (NF > max_nf)
+ max_nf = NF
+
+ ### iterate over columns
+ for (y=1; y<= NF; y++) {
+ if ($y ~ number) {
+
+ ### mean
+ count[y] += 1
+ sum[y] += $y
+ sum2[y] += $y*$y
+ mean[y] = sum[y]/count[y]
+
+ ### difference from the mean
+ delta[y] = $y - mean[y]
+ sum_delta[y] += delta[y]
+ sum_delta2[y] += delta[y]*delta[y]
+
+ ### sample variance
+ if (count[y] - 1)
+ var[y] = sum_delta2[y]/(count[y] - 1)
+ else
+ var[y] = 0
+
+ # x = row, y = col
+ for (x=1; x<=max_nf; x++) {
+ count[x,y] += 1
+ sum_xy[x,y] += $x*$y
+ sum_delta_xy[x,y] += delta[x]*delta[y]
+
+ # correlation
+ r_den[x,y] = sqrt(sum_delta2[x]*sum_delta2[y])
+ if (r_den[x,y])
+ r[x,y] = sum_delta_xy[x,y]/r_den[x,y]
+ else
+ r[x,y] = 1
+
+ ab_den[x,y] = (count[x,y]*sum2[x] - sum[x]*sum[x])
+ if (ab_den[x,y]) {
+ a[x,y] = (sum[y]*sum2[x] - sum[x]*sum_xy[x,y])/ab_den[x,y]
+ b[x,y] = (count[x,y]*sum_xy[x,y] - sum[x]*sum[y])/ab_den[x,y]
+ }
+ else {
+ a[x,y] = 0
+ b[x,y] = 1
+ }
+
+ ### error estimate
+ err_den[x,y] = count[x,y]*(count[x,y] - 2)
+ if (count[x,y] > 2) {
+ err[x,y] = $y - (a[x,y] + b[x,y]*$x)
+ sum_err2[x,y] += err[x,y]*err[x,y]
+ }
+ b_err_den[x,y] = (count[x,y] - 2)*sum_delta2[x]
+ if (b_err_den[x,y])
+ b_err[x,y] = sqrt(sum_err2[x,y]/b_err_den[x,y])
+ a_err_den[x,y] = count[x,y]*b_err_den[x,y]
+ if (a_err_den[x,y])
+ a_err[x,y] = sqrt(sum2[x]/count[x,y])*b_err[x,y]
+ }
+ }
+ else
+ continue
+ }
+}
+
+END {
+ for (y=1; y<=max_nf; y++) {
+ for (x=1; x<=max_nf; x++) {
+ if (x != y && r[x,y]) {
+ printf("\n %g \t (%s) \t = (%g +/- %g)(%s) \t + (%g +/- %g)",
+ 10.0*log(r[x,y]*r[x,y])/log(10), header[y], b[x,y], b_err[x,y], header[x],
+ a[x,y], a_err[x,y])
+ }
+ }
+ }
+}
+