summaryrefslogtreecommitdiff
path: root/mean_avg.awk
blob: bef984d332ac7ead5bd5fb2a0d96fec3aae1891d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/awk -f

### mean_avg.awk
# average columns of numerical data
# input: delimited data as text
# output: list of univariate summary stats

# Set up the output delimiter and the pattern that recognises numeric fields.
BEGIN {
    # Write results with the same delimiter the input uses. FS is already
    # final here when it was given with -F on the command line.
    # NOTE(review): if FS is a regular expression rather than a literal
    # separator, OFS will be that regex text verbatim -- confirm callers
    # only pass literal delimiters.
    OFS = FS

    # A field is numeric when the whole field consists of:
    #   optional sign       [+-]?
    #   mantissa, either    [0-9]+[.]?[0-9]*   digits with optional fraction
    #             or        [.][0-9]+          fraction with a leading dot
    #   optional exponent   ([Ee][+-]?[0-9]+)?
    # The leading-dot form requires at least one digit so that a bare "."
    # (or "+.", ".e3") is not accepted and silently counted as the value 0.
    number = "^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)([Ee][+-]?[0-9]+)?$"
}

# Name the columns from the first record. A field that is not numeric is
# used verbatim as that column's name; a numeric field gets the generated
# name "col<N>" instead. The record itself is still seen by later rules.
NR == 1 {
    for (i = 1; i <= NF; i++) {
        if ($i ~ number)
            header[i] = "col" i
        else
            header[i] = $i
    }
}

# Accumulate per-column running statistics for every non-empty record.
# Mean and the sum of squared deviations (M2) are updated incrementally
# with Welford's 'online' algorithm, so individual values are never stored.
NF {
    # Remember the widest record so END knows how many columns to report.
    if (max_nf < NF)
        max_nf = NF

    for (n = 1; n <= NF; n++) {
        # Fields that do not look like numbers contribute nothing.
        if ($n !~ number)
            continue

        count[n]++

        # Track the extremes; the first numeric value seeds both of them.
        if (count[n] == 1 || $n < min[n])
            min[n] = $n
        if (count[n] == 1 || $n > max[n])
            max[n] = $n
        range[n] = max[n] - min[n]

        # Welford step: deviation from the old mean, move the mean, then
        # deviation from the new mean; their product is added to M2.
        dev_old = $n - mean[n]
        mean[n] += dev_old / count[n]
        dev_new = $n - mean[n]
        M2[n] += dev_old * dev_new

        # Sample variance (count - 1 denominator) needs at least two values;
        # with a single value it is recorded as the string "nan".
        if (count[n] > 1)
            var[n] = M2[n] / (count[n] - 1)
        else
            var[n] = "nan"
    }
}

# Print one summary line per column, preceded by a header line.
# Output fields: column name, mean, standard error of the mean, sample
# standard deviation, range, minimum, maximum, total and count.
END {
    print "col", "mean", "std_err", "std_dev", "range", "min", "max", "total", "count"
    for (n = 1; n <= max_nf; n++) {
        # Fall back to a generated name when the first record supplied none
        # (e.g. this column only appears in later, wider records).
        name = header[n] ? header[n] : "col" n

        # A column with no numeric values at all (a text/label column) has
        # no statistics; report it explicitly instead of dividing by a zero
        # count, which would abort the script before the remaining columns
        # are printed.
        if (count[n] == 0) {
            print name, "nan", "nan", "nan", "nan", "nan", "nan", 0, 0
            continue
        }

        # Sample variance needs at least two values. For a single value the
        # spread is undefined, so say so rather than taking sqrt() of a
        # placeholder string (which most awks evaluate as 0).
        if (count[n] > 1) {
            std_dev = sqrt(var[n])
            std_err = sqrt(var[n] / count[n])
        } else {
            std_dev = "nan"
            std_err = "nan"
        }

        # The total is recovered from the running mean: mean * count.
        print name, mean[n], std_err, std_dev, range[n], min[n], max[n], mean[n] * count[n], count[n]
    }
}