#!/usr/bin/awk -f
### mean_avg.awk
# Summarise columns of numerical data.
#
# For every input column one line is printed containing:
#   col, mean, std_err, std_dev, min, max, count
#
# - Fields that do not look like a number are ignored, so text columns and
#   header rows can be mixed with data.
# - Non-numeric fields of the first record are used as column names; any
#   column without such a name is reported as "col<N>".
# - std_dev is the sample standard deviation (n - 1 denominator) and
#   std_err is sqrt(variance / n).  Both are printed as "nan" when a column
#   has fewer than two numeric values.
# - A column with no numeric values at all is reported with "nan"
#   statistics and a count of 0.
#
# Usage: awk -f mean_avg.awk [-F sep] [file ...]

BEGIN {
    # Write the report with the same separator that is used to split input.
    OFS = FS

    # A numeric field is an optionally signed decimal with optional exponent:
    #   sign     = "[+-]?"
    #   decimal  = "[0-9]+[.]?[0-9]*"
    #   fraction = "[.][0-9]+"          (at least one digit, so "." alone
    #                                    is not mistaken for a number)
    #   exponent = "([Ee][+-]?[0-9]+)?"
    number = "^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)([Ee][+-]?[0-9]+)?$"
}

# First record: remember column names.  A numeric field here is data, not a
# name, so that column gets the generic "col<N>" label instead.
NR == 1 {
    for (n = 1; n <= NF; n++) {
        if ($n ~ number)
            header[n] = "col" n
        else
            header[n] = $n
    }
}

# Every non-empty record (including the first): update the running
# statistics of each numeric field with Welford's 'online' algorithm.
NF > 0 {
    if (NF > max_nf)
        max_nf = NF

    for (n = 1; n <= NF; n++) {
        if ($n !~ number)
            continue

        x = $n + 0          # force a numeric value for arithmetic/comparison
        count[n]++

        # Keep the original field text for output, but compare numerically.
        if (count[n] == 1 || x < min[n] + 0)
            min[n] = $n
        if (count[n] == 1 || x > max[n] + 0)
            max[n] = $n

        # Welford update: M2 accumulates the sum of squared deviations.
        delta = x - mean[n]
        mean[n] += delta / count[n]
        M2[n] += delta * (x - mean[n])
    }
}

END {
    print "col", "mean", "std_err", "std_dev", "min", "max", "count"

    for (n = 1; n <= max_nf; n++) {
        name = ((n in header) && header[n] != "") ? header[n] : "col" n

        if (count[n] > 1) {
            variance = M2[n] / (count[n] - 1)
            std_dev = sqrt(variance)
            std_err = sqrt(variance / count[n])
        } else {
            # Sample variance is undefined for fewer than two values.
            std_dev = std_err = "nan"
        }

        if (count[n] > 0)
            print name, mean[n], std_err, std_dev, min[n], max[n], count[n]
        else
            # No numeric data in this column: avoid dividing by zero.
            print name, "nan", "nan", "nan", "nan", "nan", 0
    }
}