#!/usr/bin/awk -f
### mean_avg.awk
# Average columns of numerical data.
#
# Reads whitespace-separated records and, for every column, reports the mean,
# the sample standard deviation, 1.96 * (standard error of the mean) and the
# number of numeric values seen.  Non-numeric fields in the first record are
# used as column names; non-numeric fields elsewhere are ignored.  Numeric
# fields in the first record are accumulated like any other data.
#
# Usage:  awk -f mean_avg.awk data.txt      (or make it executable; the
#         shebang needs "-f" so awk reads this file as the program).

# Convert a field that already matched `number` to its numeric value.
# awk's string-to-number conversion stops at a leading "±" and would yield 0,
# so the marker is stripped and the magnitude is used instead.
function numeric_value(field) {
    sub(/^±/, "", field)
    return field + 0
}

BEGIN {
    # Written as an alternation rather than a bracket expression: "[+-±]"
    # is parsed as the RANGE '+'..'±' (matching digits, letters, ...), and
    # in byte-oriented awks the multi-byte "±" cannot be a single bracket
    # member anyway.
    sign     = "([-+]|±)?"
    exp_sign = "[-+]?"                      # exponents take a plain sign only
    decimal  = "[0-9]+[.]?[0-9]*"           # 12   12.   12.5
    fraction = "[.][0-9]+"                  # .5   (a lone "." is not a number)
    exponent = "([Ee]" exp_sign "[0-9]+)?"
    number   = "^" sign "(" decimal "|" fraction ")" exponent "$"
}

# First record: remember every non-numeric field as that column's name.
NR == 1 {
    for (n = 1; n <= NF; n++) {
        if ($n !~ number)
            header[n] = $n
    }
}

# Welford's 'online' algorithm for variance, applied per column.
# Blank records (NF == 0) are skipped.
NF != 0 {
    if (NF > max_nf)
        max_nf = NF                         # widest record decides how many columns are reported

    for (n = 1; n <= NF; n++) {
        if ($n !~ number)
            continue

        x = numeric_value($n)
        count[n]++
        delta = x - mean[n]                 # deviation from the previous mean
        mean[n] += delta / count[n]
        M2[n] += delta * (x - mean[n])      # uses the updated mean

        # Sample variance (n - 1 denominator); defined as 0 for a single value.
        var[n] = (count[n] > 1) ? M2[n] / (count[n] - 1) : 0
    }
}

END {
    printf("\n")
    printf("%s, \t%s, %s, %s, %s\n", "col", "mean", "std_dev", "std_err", "count")

    for (n = 1; n <= max_nf; n++) {
        # `in` tests membership without creating an empty header[n] entry.
        if (n in header)
            printf("%s, \t", header[n])
        else
            printf("col_%g, \t", n)

        if (count[n]) {
            # NOTE(review): the "std_err" column prints 1.96 * sqrt(var/count),
            # i.e. 1.96 times the standard error of the mean, not the bare
            # standard error.  Label and value are kept for output compatibility.
            printf("%g, ±%g, ±%g, %g\n",
                   mean[n], sqrt(var[n]), 1.96 * sqrt(var[n] / count[n]), count[n])
        } else {
            # Column held no numeric data: emit four empty cells so the row
            # has the same number of comma-separated fields as the others.
            printf(",,,\n")
        }
    }
}