diff options
Diffstat (limited to '')
| -rw-r--r-- | cov.awk | 61 | ||||
| -rw-r--r-- | lin_reg.awk | 34 | ||||
| -rw-r--r--[-rwxr-xr-x] | lin_reg1.awk | 11 | ||||
| -rw-r--r-- | lin_reg2.awk | 12 | ||||
| -rw-r--r-- | mean.awk | 3 | ||||
| -rw-r--r-- | mean_avg.awk | 27 |
6 files changed, 102 insertions, 46 deletions
@@ -0,0 +1,61 @@ +#!/usr/bin/awk -f + +### cov.awk +# online covariance algorithm + +BEGIN { + OFMT = "%.18g" + sign = "[+-]?" + decimal = "[0-9]+[.]?[0-9]*" + fraction = "[.][0-9]*" + exponent = "([Ee]" sign "[0-9]+)?" + number = "^" sign "(" decimal "|" fraction ")" exponent "$" +} + +NR == 1 { + for (y=1; y<=NF; y++) + ($y ~ number) ? header[y] = "col" y : header[y] = $y + printf(header[y]) +} + +NF > 0 { + if (NF > nf_max) + nf_max = NF + + ### columns + for (y=1; y<=nf_max; y++) { + if ($y == header[y]) + continue + ### rows + for (x=1; x<=nf_max; x++) { + count[x,y]++ + dx[x,y] = $x - meanx[x,y] + meanx[x,y] += dx[x,y]/count[x,y] + meany[x,y] += ($y - meany[x,y])/count[x,y] + C[x,y] += dx[x,y]*($y - meany[x,y]) + cov_pop[x,y] = C[x,y]/count[x,y] + (count[x,y] > 1) ? cov_samp[x,y] = C[x,y]/(count[x,y] - 1) : cov_samp[x,y] = "" + } + } +} + +END { + ### column headers + printf("cov") + for (y=1; y<=nf_max; y++) { + printf(OFS header[y]) + } + printf(ORS) + + ### columns + for (y=1; y<=nf_max; y++) { + printf(header[y] OFS) + ### rows + for (x=1; x<=nf_max; x++) { + printf("%.18g", cov_samp[x,y]) + if (x < nf_max) + printf(OFS) + } + printf(ORS) + } +} diff --git a/lin_reg.awk b/lin_reg.awk index 8622289..56114a2 100644 --- a/lin_reg.awk +++ b/lin_reg.awk @@ -4,7 +4,7 @@ # simple linear regression between columns BEGIN { - OFS = "%.18g" + OFMT = "%.18g" sign = "[+-]?" decimal = "[0-9]+[.]?[0-9]*" fraction = "[.][0-9]*" @@ -13,13 +13,11 @@ BEGIN { } NR == 1 { - header_nf = NF - for (n=1; n<=NF; n++) { - ($n !~ number) ? header[n] = $n : header[n] = "col" n - } + for (n=1; n<=NF; n++) + ($n ~ number) ? header[n] = "col" n : header[n] = $n } -NF != 0 { +NF > 0 { if (NF > nf_max) nf_max = NF @@ -31,15 +29,14 @@ NF != 0 { count[y] += 1 sum[y] += $y sum2[y] += $y*$y - mean[y] = sum[y]/count[y] - - ### difference from the mean - delta[y] = $y - mean[y] - sum_delta[y] += delta[y] - sum_delta2[y] += delta[y]*delta[y] + delta0[y] = $y - mean[y] + mean[y] = mean[y] + delta0[y]/count[y] + delta1[y] = $y - mean[y] + sum_delta[y] += delta1[y] + sum_delta2[y] += delta0[y]*delta1[y] ### sample variance - (count[y] > 1) ? var[y] = sum_delta2[y]/(count[y] - 1) : var[y] = 0 + (count[y] > 1) ? var[y] = sum_delta2[y]/(count[y] - 1) : var[y] = "" # x = row, y = col for (x=1; x<=nf_max; x++) { @@ -47,9 +44,12 @@ NF != 0 { sum_xy[x,y] += $x*$y sum_delta_xy[x,y] += delta[x]*delta[y] + # covariance + #(count[x,y] > 1) ? cov[x,y] = sum_delta_xy[x,y]/(count[x,y] - 1) : cov[x,y] = "" + # correlation r_den[x,y] = sqrt(sum_delta2[x]*sum_delta2[y]) - (r_den[x,y]) ? r[x,y] = sum_delta_xy[x,y]/r_den[x,y] : r[x,y] = 1 + (r_den[x,y]) ? r[x,y] = sum_delta_xy[x,y]/r_den[x,y] : r[x,y] = 1 ab_den[x,y] = (count[x,y]*sum2[x] - sum[x]*sum[x]) if (ab_den[x,y]) { @@ -84,9 +84,9 @@ END { for (y=1; y<=nf_max; y++) { for (x=1; x<=nf_max; x++) { if (x != y && r[x,y]) { - printf("\n %.18g \t (%s) \t = (%.18g +/- %.18g)(%s) \t + (%.18g +/- %.18g)", - 10.0*log(r[x,y]*r[x,y])/log(10), header[y], b[x,y], b_err[x,y], header[x], - a[x,y], a_err[x,y]) + printf("%.9g "OFS" (%s) "OFS" = (%.9g +/- %.9g)(%s) "OFS" + (%.9g +/- %.9g)%s", + 10.0*log(r[x,y]*r[x,y])/log(10.0), header[y], b[x,y], + b_err[x,y], header[x], a[x,y], a_err[x,y], ORS) } } } diff --git a/lin_reg1.awk b/lin_reg1.awk index cf3e69a..8b03e2f 100755..100644 --- a/lin_reg1.awk +++ b/lin_reg1.awk @@ -4,7 +4,7 @@ # simple linear regression between columns BEGIN { - OFS = "%.18g" + OFMT = "%.18g" sign = "[+-]?" decimal = "[0-9]+[.]?[0-9]*" fraction = "[.][0-9]*" @@ -13,13 +13,11 @@ BEGIN { } NR == 1 { - header_nf = NF - for (n=1; n<=NF; n++) { - ($n !~ number) ? header[n] = $n : header[n] = "col" n - } + for (n=1; n<=NF; n++) + ($n ~ number) ? header[n] = "col" n : header[n] = $n } -NF != 0 { +NF > 0 { if (NF > nf_max) nf_max = NF @@ -44,6 +42,7 @@ NF != 0 { sum_xy[x,y] += $x*$y sum_delta_xy[x,y] += delta[x]*delta[y] + # correlation r_den[x,y] = sqrt(sum_delta2[x]*sum_delta2[y]) (r_den[x,y]) ? r[x,y] = sum_delta_xy[x,y]/r_den[x,y] : r[x,y] = 0 diff --git a/lin_reg2.awk b/lin_reg2.awk index dea52b5..7c00fe8 100644 --- a/lin_reg2.awk +++ b/lin_reg2.awk @@ -4,7 +4,7 @@ # simple linear regression between columns BEGIN { - OFS = "%.18g" + OFMT = "%.18g" sign = "[+-]?" decimal = "[0-9]+[.]?[0-9]*" fraction = "[.][0-9]*" @@ -13,13 +13,11 @@ BEGIN { } NR == 1 { - header_nf = NF - for (n=1; n<=NF; n++) { - ($n !~ number) ? header[n] = $n : header[n] = "col" n - } + for (n=1; n<=NF; n++) + ($n ~ number) ? header[n] = "col" n : header[n] = $n } -NF != 0 { +NF > 0 { if (NF > nf_max) nf_max = NF @@ -39,7 +37,7 @@ NF != 0 { sum_delta2[y] += delta[y]*delta[y] ### sample variance - (count[y] - 1) ? var[y] = sum_delta2[y]/(count[y] - 1) : var[y] = 0 + (count[y] > 1) ? var[y] = sum_delta2[y]/(count[y] - 1) : var[y] = "" # x = row, y = col for (x=1; x<=nf_max; x++) { @@ -4,7 +4,7 @@ # calculate mean average
BEGIN {
- OFS = "%.18g"
+ OFMT = "%.18g"
sign = "[+-±]?"
decimal = "[0-9]+[.]?[0-9]*"
fraction = "[.][0-9]*"
@@ -33,3 +33,4 @@ END { print "mean", "std_dev", "std_err", "count"
print mean, sqrt(var), sqrt(var/count), count
}
+
diff --git a/mean_avg.awk b/mean_avg.awk index 6f5a270..565dcc2 100644 --- a/mean_avg.awk +++ b/mean_avg.awk @@ -4,20 +4,17 @@ # average columns of numerical data BEGIN { - OFS = "%.18g" - sign = "[+-±]?" - decimal = "[0-9]+[.]?[0-9]*" - fraction = "[.][0-9]*" - exponent = "([Ee]" sign "[0-9]+)?" - number = "^" sign "(" decimal "|" fraction ")" exponent "$" + OFMT = "%.18g" + #sign = "[+-]?" + #decimal = "[0-9]+[.]?[0-9]*" + #fraction = "[.][0-9]*" + #exponent = "([Ee]" "[+-]?" "[0-9]+)?" + number = "^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]*)([Ee][+-]?[0-9]+)?$" } NR == 1 { - header_nf = NF - for (n=1; n<=NF; n++) { - if ($n !~ number) - header[n] = $n - } + for (n=1; n<=NF; n++) + ($n ~ number) ? header[n] = "col" n : header[n] = $n } # Welford's 'online' algorithm for variance @@ -28,10 +25,10 @@ NF > 0 { if ($n ~ number) { count[n] += 1 delta0[n] = $n - mean[n] - mean[n] = mean[n] + delta0[n]/count[n] + mean[n] += delta0[n]/count[n] delta1[n] = $n - mean[n] - M2[n] = M2[n] + delta0[n]*delta1[n] - (count[n] > 1) ? var[n] = M2[n]/(count[n] - 1) : var[n] = "0" + M2[n] += delta0[n]*delta1[n] + (count[n] > 1) ? var[n] = M2[n]/(count[n] - 1) : var[n] = "" } } } @@ -42,7 +39,7 @@ END { if (header[n]) print header[n], mean[n], sqrt(var[n]), sqrt(var[n]/count[n]), count[n] else - print "col_" n, mean[n], sqrt(var[n]), sqrt(var[n]/count[n]), count[n] + print "col" n, mean[n], sqrt(var[n]), sqrt(var[n]/count[n]), count[n] } } |
