I couldn’t find anything on Google about people doing linear regression with influxdb, and it doesn’t appear to be supported natively (yet), so I wrote a query for it so we could forecast time series and predict disk failure. I’ve included the query below in case anybody else finds it useful. Sorry it is so messy!
SELECT
((100 - b) / slope) as days_until_full,
slope,
b
FROM
(
SELECT
(n * sum_xy - sum_x * sum_y) / (n * sum_xx - pow(sum_x,2)) * 24 as slope, -- convert hours -> days
y_bar - x_bar * (n * sum_xy - sum_x * sum_y) / (n * sum_xx - pow(sum_x,2)) as b,
n / 24 -- convert hours -> days
FROM (
SELECT
count(y) as n,
sum(x) as sum_x,
sum(y) as sum_y,
sum(x_times_y) as sum_xy,
sum(x_times_x) as sum_xx,
mean(x) as x_bar,
mean(y) as y_bar
FROM
(
SELECT
CUMULATIVE_SUM(elapsed(mean(metric_value), 1h)) as x,
pow(CUMULATIVE_SUM(elapsed(mean(metric_value), 1h)), 2) as x_times_x,
mean(metric_value) as y,
CUMULATIVE_SUM(elapsed(mean(metric_value), 1h)) * mean(metric_value) as x_times_y
FROM "servicestatus"
WHERE
(
"check_command" =~ /check_(snmp_)?disk/
AND metric_name =~ /Percent Usage (Reported|Calculated) % /
)
AND $timeFilter
GROUP BY
metric_name,
host_name,
time(1h)
)
group by
metric_name,
host_name
)
group by
metric_name,
host_name
)
where
slope != 0 or slope = 0 -- bootleg version of filtering out nulls
group by
metric_name,
host_name