#!/usr/bin/awk -f
### lin_reg1.awk
# Simple linear regression between every pair of numeric columns.
#
# Input:  whitespace-separated records.  Fields of the first record that are
#         not numbers are used as column names; every other column is named
#         "col<N>".  Empty records are ignored.
# Output: for each ordered pair of distinct columns (y, x) with a non-zero
#         correlation, one entry of the form
#             <10*log10(r^2)>  (<y name>)  = <slope>(<x name>)  + <intercept>
#         Each entry is preceded (not followed) by a newline.
#
# Only records in which BOTH cells of a pair are numeric contribute to that
# pair.  Means, sums of squares and the co-moment are updated with Welford's
# online recurrences, so every statistic of a pair always refers to the same
# set of rows and no catastrophic cancellation occurs for large offsets.

BEGIN {
    # Number-to-string format for print (the original assigned this to OFS,
    # the output field separator, by mistake).
    OFMT = "%.9g"

    # Dynamic regex recognising a decimal floating point literal.
    sign     = "[+-]?"
    decimal  = "[0-9]+[.]?[0-9]*"
    fraction = "[.][0-9]+"              # at least one digit: "." is not a number
    exponent = "([Ee]" sign "[0-9]+)?"
    number   = "^" sign "(" decimal "|" fraction ")" exponent "$"
}

# Name of column c: the header text if one was captured, else "col<c>".
function column_name(c) {
    return (c in header) ? header[c] : "col" c
}

### first record: capture column names
NR == 1 {
    for (c = 1; c <= NF; c++) {
        if ($c !~ number)
            header[c] = $c
        else
            header[c] = "col" c
    }
}

### every non-empty record: update the statistics of each column pair
NF != 0 {
    if (NF > nf_max)
        nf_max = NF

    for (i = 1; i <= NF; i++)
        is_num[i] = ($i ~ number)

    # Pairs are stored once, keyed (i, j) with i < j.
    for (i = 1; i < NF; i++) {
        if (!is_num[i])
            continue
        for (j = i + 1; j <= NF; j++) {
            if (!is_num[j])
                continue

            cnt = ++pair_n[i, j]

            # Welford update: d* is the deviation from the OLD mean, the
            # second factor below is the deviation from the NEW mean.
            di = $i - mean_i[i, j]
            dj = $j - mean_j[i, j]
            mean_i[i, j] += di / cnt
            mean_j[i, j] += dj / cnt

            ss_i[i, j]  += di * ($i - mean_i[i, j])
            ss_j[i, j]  += dj * ($j - mean_j[i, j])
            ss_ij[i, j] += di * ($j - mean_j[i, j])
        }
    }
}

### report: regress column y on column x for every ordered pair
END {
    for (y = 1; y <= nf_max; y++) {
        for (x = 1; x <= nf_max; x++) {
            if (x == y)
                continue

            # Fetch the pair's statistics in (x, y) orientation.
            if (x < y) {
                k = x SUBSEP y
                if (!(k in pair_n)) continue
                mx = mean_i[k]; my = mean_j[k]
                sxx = ss_i[k];  syy = ss_j[k]
            } else {
                k = y SUBSEP x
                if (!(k in pair_n)) continue
                mx = mean_j[k]; my = mean_i[k]
                sxx = ss_j[k];  syy = ss_i[k]
            }
            sxy = ss_ij[k]

            # A constant column or zero covariance gives r = 0 (or an
            # undefined r); such pairs are not reported, which also avoids
            # log(0).
            if (sxx <= 0 || syy <= 0 || sxy == 0)
                continue

            r  = sxy / sqrt(sxx * syy)
            r2 = r * r
            b  = sxy / sxx          # slope
            a  = my - b * mx        # intercept

            printf("\n %.9g \t (%s) \t = %.9g(%s) \t + %.9g", 10.0*log(r2)/log(10), column_name(y), b, column_name(x), a)
        }
    }
}