The following are examples of exploratory plots generated from
meteorological data for Waterville, Maine (USA). Data were pulled from
ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite/ then
saved in a native R file format named met1991_2023.Rds
available on this repo using the
Meteo_lite.R
script (also available on this repo).
The dataset spans the years 1991 through 2023. A data summary follows.
library(MASS)
library(dplyr)
library(lubridate)
library(ggplot2)
dat <- readRDS("met1991_2023.Rds")
stargazer::stargazer(as.data.frame(dat), type = "html", flip = FALSE,
title="Descriptive statistics", digits = 2,
mean.sd = FALSE, no.space = FALSE )
Statistic | N | Min | Max |
Temp | 228,499 | -22.00 | 98.60 |
DP | 227,651 | -32.80 | 80.60 |
Speed | 228,561 | 0.00 | 40.26 |
Dir | 220,483 | 0 | 360 |
Press | 68,414 | 974.20 | 1,048.20 |
Precip_1hr | 64,561 | -0.004 | 10.00 |
Precip_6hr | 8,403 | -0.004 | 10.00 |
With a large temporal dataset, it’s always a good idea to scan for missing values. The following plot is a calendar heatmap showing the percentage of missing temperature values. Each block element represents a day.
dat %>% mutate(missing = as.numeric(is.na(Temp)),
month = month(Date_EST, label=TRUE,abbr=TRUE),
year = year(Date_EST),
weekday = wday(Date_EST,label=TRUE),
monthw = ceiling(day(Date_EST) / 7) ) %>%
group_by(month,year,weekday,monthw) %>%
summarise(percent = mean(missing) * 100) %>%
ggplot() + aes( weekday, monthw,fill = percent) +
geom_tile(colour = "white") +
facet_grid(year~month) +
scale_fill_gradient(low="green", high="red") +
theme(axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
strip.text=element_text(angle=0),
strip.text.y=element_text(angle=0),
panel.spacing = unit(0, "lines"))
There’s a large swath of missing values for the years 1996 through 1999 and for the year 2001.
Next, we’ll generate a median temperature heatmap (in degrees Fahrenheit) but limit the output to days with fewer than 70% missing values. Days for which more than 70% of values are missing are assigned a grey color.
dat %>% mutate(missing = as.numeric(is.na(Temp)),
month = month(Date_EST, label=TRUE,abbr=TRUE),
year = year(Date_EST),
weekday = wday(Date_EST,label=TRUE),
monthw = ceiling(day(Date_EST) / 7) ) %>%
group_by(month,year,weekday,monthw) %>%
summarise(percent = mean(missing) * 100,
Temp = ifelse(percent > 70, NA, median(Temp, na.rm=TRUE))) %>%
ggplot() + aes( weekday,monthw, fill = Temp) +
geom_tile(colour = "white") +
facet_grid(year~month) +
scale_fill_gradient2(low="blue", mid="yellow", high="red",
na.value = "grey70", midpoint = 40) +
theme(axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
strip.text.y=element_text(angle=0),
panel.spacing = unit(0, "lines"))
The following heatmap applies a loess
smooth surface to the data. This technique helps visualize
overall patterns in the two dimensional dataset. As small loess span (~
0.04
) is adopted to help capture small (time) scale
variations on the data. To avoid edge effects at the beginning and end
of each year, the data are wrapped when fitting a loess surface. But
wrapping is not applied to the ends of the date ranges (i.e. prior to
2003 and after 2023). Note that because of the large swaths of missing
values, data are restricted to 2003 and up.
dat.loess <- dat %>%
select(Temp, Date_EST) %>%
na.omit() %>%
filter(year(Date_EST) > 2002) %>%
mutate(date = as.Date(Date_EST)) %>%
group_by(date) %>%
summarise(temp = mean(Temp, na.rm = TRUE)) %>%
ungroup() %>%
mutate(dec.year = decimal_date(date),
year = floor(dec.year),
day= yday(date)) %>%
select(-dec.year)
# Wrap dates to correct for temporal edge effects
dat.loess.pre <- dat.loess %>%
filter(day > 250) %>%
mutate(year = year + 1,
day = day - 366)
dat.loess.post <- dat.loess %>%
filter(day < 100) %>%
mutate(year = year - 1,
day = day + 366)
dat.lowess.wrap <- rbind(dat.loess, dat.loess.pre, dat.loess.post)
dat.smooth <- loess(temp ~ year + day, dat.lowess.wrap,
span = 0.04, degree = 2,
family = "symmetric")
smooth.df <- data.frame(dat.smooth$x, dat.smooth$fitted) %>%
filter(day > 0 & day < 366)
ggplot(smooth.df) + aes( day , year,
fill = `dat.smooth.fitted`,
z = `dat.smooth.fitted`) +
geom_raster(interpolate = TRUE) +
# scale_fill_gradientn(colours = terrain.colors(10)) +
scico::scale_fill_scico(palette = "romaO", direction = -1, name = "Mean Temperature") +
geom_contour(size = 0.2, color = "#EE9A00",
breaks = seq(0, 100, by = 7)) +
geom_vline(xintercept = c(80, 172, 264), col = "grey90", lty=2 ) +
scale_x_continuous(breaks = c(80, 172, 264),
labels = c("Mar 21", "Jun 21", "Sep 21"),
name = NULL) +
scale_y_reverse()
A few noteworthy observations is the increase in mean temperatures for the winter months and an increase in temperatures around mid-summer.
The following heatmap shows the difference between reported daily median temperature (in degrees Fahrenheit) and the ~30 year median temperature for that day.
dat.normalized <- dat %>%
mutate(missing = as.numeric(is.na(Temp)),
month = month(Date_EST, label=TRUE,abbr=TRUE),
year = year(Date_EST),
weekday = wday(Date_EST,label=TRUE),
monthw = ceiling(day(Date_EST) / 7) ) %>%
group_by(month,year,weekday,monthw) %>%
summarise(percent = mean(missing) * 100,
Temp = ifelse(percent > 70, NA, median(Temp, na.rm=TRUE))) %>%
ungroup() %>%
group_by(month,monthw,weekday) %>%
mutate(`Temperature anomaly` = Temp - median(Temp, na.rm=TRUE))
ggplot(dat.normalized) + aes( weekday,monthw, fill = `Temperature anomaly`) +
geom_tile(colour = "white") +
facet_grid(year ~ month) +
scale_fill_stepsn(colors = c("#455DB5", "#4575B4", "#74ADD1",
"#FDAE61", "#F46D43", "#D73027") ,
breaks = c(-20, -10, 0, 10, 20),
na.value = "grey90") +
theme(axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
strip.text.y=element_text(angle=0),
panel.spacing = unit(0, "lines"))
The following plots show median monthly temperatures per year. Only months with 30% or fewer missing values are displayed. A resistant straight line and a resistant LOESS curve are fitted to the data.
dat %>% mutate(missing = as.numeric(is.na(Temp)),
month = month(Date_EST, label=TRUE,abbr=TRUE),
Year = year(Date_EST)) %>%
# filter(month %in% c("Feb","May","Aug","Oct") )%>%
group_by(month,Year) %>%
summarise(percent = mean(missing) * 100,
Temp = ifelse(percent > 30, NA, median(Temp, na.rm=TRUE))) %>%
ggplot() + aes(x=Year, y=Temp) +geom_point() +
facet_wrap(month~., scales="free", ncol = 4) +
geom_smooth(method = "loess", se=FALSE, size=0.5,
method.args = list(family="symmetric")) +
geom_smooth(method="rlm", colour="red", se=FALSE, size=0.5) +
ggtitle(expression("Median yearly temperatures (in " * degree * "F)"))
The missing values for the years 1996 through 1999 create a gap in the data which could influence the LOESS curve however, the straight line fit should be resistant to the imbalanced distribution of values.
The following plots show the median monthly top 10% temperature values per year. Only months with 30% or fewer missing values are displayed. A resistant straight line and a resistant LOESS curve are fitted to the data.
dat %>% mutate(missing = as.numeric(is.na(Temp)),
month = month(Date_EST, label=TRUE,abbr=TRUE),
Year = year(Date_EST)) %>%
# filter(month %in% c("Feb","May","Aug","Oct")) %>%
group_by(month,Year) %>%
summarise(percent = mean(missing) * 100,
topTemp = median(Temp[percent_rank(Temp) > 0.9], na.rm=TRUE),
Temp = ifelse(percent > 30, NA, topTemp)) %>%
ggplot() + aes(x=Year, y=Temp) +geom_point() +
facet_wrap(month~., scales="free", ncol = 4) +
geom_smooth(method="loess", se=FALSE, size=0.5,
method.args = list(family="symmetric")) +
geom_smooth(method="rlm", colour="red", se=FALSE, size=0.5) +
ggtitle(expression("Median top 10% temperature (in " * degree * "F)"))
The following plots show the median monthly bottom 10% temperature values per year. Only months with 30% or fewer missing values are displayed. A resistant straight line and a resistant LOESS curve are fitted to the data.
dat %>% mutate(missing = as.numeric(is.na(Temp)),
month = month(Date_EST, label=TRUE,abbr=TRUE),
Year = year(Date_EST)) %>%
# filter(month %in% c("Feb","May","Aug","Oct")) %>%
group_by(month,Year) %>%
summarise(percent = mean(missing) * 100,
topTemp = median(Temp[percent_rank(Temp) < 0.1], na.rm=TRUE),
Temp = ifelse(percent > 30, NA, topTemp)) %>%
ggplot() + aes(x=Year, y=Temp) +geom_point() +
facet_wrap(month~., scales="free", ncol = 4) +
geom_smooth(method="loess", se=FALSE, size=0.5,
method.args = list(family="symmetric")) +
geom_smooth(method="rlm", colour="red", se=FALSE, size=0.5) +
ggtitle(expression("Median bottom 10% temperature (in " * degree * "F)"))
The following plots show median monthly dewpoints per year. Only months with 30% or fewer missing values are displayed. A resistant straight line and a resistant LOESS curve are fitted to the data.
dat %>% mutate(missing = as.numeric(is.na(DP)),
month = month(Date_EST, label=TRUE,abbr=TRUE),
Year = year(Date_EST)) %>%
# filter(month %in% c("Feb","May","Aug","Oct") )%>%
group_by(month,Year) %>%
summarise(percent = mean(missing) * 100,
DP = ifelse(percent > 30, NA, median(DP, na.rm=TRUE))) %>%
ggplot() + aes(x=Year, y=DP) +geom_point() +
facet_wrap(month~., scales="free", ncol = 4) +
geom_smooth(method = "loess", se=FALSE, size=0.5,
method.args = list(family="symmetric")) +
geom_smooth(method="rlm", colour="red", se=FALSE, size=0.5) +
ggtitle(expression("Median yearly dewpoint (in " * degree * "F)"))
The following plots show the median monthly top 10% dewpoint values per year. Only months with 30% or fewer missing values are displayed. A resistant straight line and a resistant LOESS curve are fitted to the data.
dat %>% mutate(missing = as.numeric(is.na(DP)),
month = month(Date_EST, label=TRUE,abbr=TRUE),
Year = year(Date_EST)) %>%
# filter(month %in% c("Feb","May","Aug","Oct")) %>%
group_by(month,Year) %>%
summarise(percent = mean(missing) * 100,
topDP = median(DP[percent_rank(DP) > 0.9], na.rm=TRUE),
DP = ifelse(percent > 30, NA, topDP)) %>%
ggplot() + aes(x=Year, y=DP) +geom_point() +
facet_wrap(month~., scales="free", ncol = 4) +
geom_smooth(method="loess", se=FALSE, size=0.5,
method.args = list(family="symmetric")) +
geom_smooth(method="rlm", colour="red", se=FALSE, size=0.5) +
ggtitle(expression("Median top 10% dewpoint (in " * degree * "F)"))
The following plots show the median monthly bottom 10% dewpoint values per year. Only months with 30% or fewer missing values are displayed. A resistant straight line and a resistant LOESS curve are fitted to the data.
dat %>% mutate(missing = as.numeric(is.na(DP)),
month = month(Date_EST, label=TRUE,abbr=TRUE),
Year = year(Date_EST)) %>%
# filter(month %in% c("Feb","May","Aug","Oct")) %>%
group_by(month,Year) %>%
summarise(percent = mean(missing) * 100,
botDP = median(DP[percent_rank(DP) < 0.1], na.rm=TRUE),
DP = ifelse(percent > 30, NA, botDP)) %>%
ggplot() + aes(x=Year, y=DP) +geom_point() +
facet_wrap(month ~ ., scales="free", ncol = 4) +
geom_smooth(method="loess", se=FALSE, size=0.5,
method.args = list(family="symmetric")) +
geom_smooth(method="rlm", colour="red", se=FALSE, size=0.5) +
ggtitle(expression("Median bottom 10% dewpoint (in " * degree * "F)"))
Manuel Gimond, 2023