From 7c58098c9f41c405ef07264d2ea5c006578e60d9 Mon Sep 17 00:00:00 2001
From: wukong
Date: Sat, 9 Jun 2018 16:32:52 -0700
Subject: updated mean_avg to use a one-pass online algorithm for mean and variance

---
Review notes (ignored by git-am; not part of the commit message):

* Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch. Blank context lines and indent widths are a best-effort
  reconstruction chosen to agree with the hunk headers -- verify against the
  repository before applying.
* BEGIN: "[+-±]" is a bracket-expression RANGE running from '+' to '±', not a
  list of three sign characters, so ordinary digits/letters could be taken as
  a "sign". The '-' is now listed first ("[-+±]"), which makes it a literal.
* NOTE(review): awk converts a string that starts with '±' to 0 when it is
  used numerically, so a field such as "±1.5" would be accumulated as 0 if
  the `number` pattern (defined outside this hunk) lets it through -- confirm
  whether '±' should be accepted at all.
* NF != 0: per-record update is Welford's method. delta0 is the deviation from
  the previous mean, delta1 the deviation from the updated mean, and M2
  accumulates delta0*delta1, so var = M2/(count-1) is the sample variance.
  Unset array elements read as 0 in awk, which provides the initial state.
* END: std_dev is sqrt(var); std_err is 1.96*sqrt(var/count). The row printed
  for a column with no numeric data had four commas after the label, i.e. one
  field more than the five-column header and the data rows; it now has three.

 mean_avg.awk | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/mean_avg.awk b/mean_avg.awk
index 799f96c..23b1e73 100644
--- a/mean_avg.awk
+++ b/mean_avg.awk
@@ -4,7 +4,7 @@
 # average columns of numerical data

 BEGIN {
-    sign = "[+-]?"
+    sign = "[-+±]?"
     decimal = "[0-9]+[.]?[0-9]*"
     fraction = "[.][0-9]*"
     exponent = "([Ee]" sign "[0-9]+)?"
@@ -19,44 +19,39 @@ NR == 1 {
     }
 }

+# Welford's 'online' algorithm for variance
 NF != 0 {
     if (NF > max_nf)
         max_nf = NF
     for (n=1; n <= NF; n++) {
-        if ($n !~ number) {
-            continue
-        }
         if ($n ~ number) {
             count[n] += 1
-            sum[n] += $n
-            sum2[n] += $n*$n
-            mean[n] = sum[n]/count[n]
-            delta[n] = $n - mean[n]
-            delta2[n] = delta[n]*delta[n]
-            sum_delta[n] += delta[n]
-            sum_delta2[n] += delta2[n]
-            if ((count[n] - 1) != 0)
-                var[n] = sum_delta2[n]/(count[n] - 1)
+            delta0[n] = $n - mean[n]
+            mean[n] = mean[n] + delta0[n]/count[n]
+            delta1[n] = $n - mean[n]
+            M2[n] = M2[n] + delta0[n]*delta1[n]
+            if (count[n] > 1)
+                var[n] = M2[n]/(count[n] - 1)
             else
-                var[n] = 0
+                var[n] = "0"
         }
     }
 }

 END {
     printf("\n")
-    printf("%-6s\t%-6s %-6s %-6s\n", "col", "mean", "std_err", "count")
+    printf("%s, \t%s, %s, %s, %s\n", "col", "mean", "std_dev", "std_err", "count")

     for (n=1; n<=max_nf; n++) {
         if (header[n])
-            printf("%-6s\t", header[n])
+            printf("%s, \t", header[n])
         else
-            printf("%-6g\t", n)
+            printf("col_%g, \t", n)
         if (count[n]) {
-            printf("%-6g ±%-6g %-6g\n",
-                    mean[n], 1.96*sqrt(var[n]/count[n]), count[n])
+            printf("%g, ±%g, ±%g, %g\n",
+                    mean[n], sqrt(var[n]), 1.96*sqrt(var[n]/count[n]), count[n])
         }
         else
-            printf("%2s %2s %2s\n", " ", " ", " ")
+            printf(",,,\n")
     }
 }
-- 
cgit v1.2.3