This notebook documents the data ingestion stage of the housing affordability analysis pipeline. Its purpose is to retrieve, validate, and store raw data from multiple public sources in a consistent and reproducible manner. All subsequent transformation, analysis, and modeling steps depend on the integrity of the data collected here.
The ingestion process integrates data from three primary sources. Zillow Home Value Index (ZHVI) data provide monthly measures of median home values by metropolitan area. Macroeconomic indicators, including mortgage interest rates and inflation, are sourced from the Federal Reserve Economic Data (FRED) API. Median household income data are obtained from the U.S. Census Bureau’s American Community Survey (ACS). Together, these datasets form the foundation for constructing a longitudinal panel of housing market conditions across U.S. metropolitan areas.
This notebook focuses exclusively on data acquisition and initial validation. It performs no feature engineering or analytical transformations beyond those required to standardize variable names, date formats, and geographic identifiers. Each ingestion step includes basic verification checks to confirm data completeness and expected ranges before writing outputs to disk.
All raw data generated by this notebook are saved to the
data/raw/ directory and are intentionally excluded from
version control. Reproducibility is ensured through the use of API-based
retrieval, environment-managed dependencies, and explicit documentation
of data sources and parameters. Downstream notebooks rely on these raw
files as immutable inputs for transformation and analysis.
# --- Libraries & API configuration ---
library(tidyverse)   # dplyr/tidyr/readr/purrr etc.
library(lubridate)   # date helpers (is.Date)
library(broom)
library(corrplot)
library(fredr)       # FRED API client (previously loaded twice; duplicate removed)
library(tidycensus)  # Census/ACS API client

set.seed(42)

# API keys are read from the environment so no secrets live in the notebook.
# Expect CENSUS_API_KEY and FRED_API_KEY to be set (e.g. via .Renviron).
census_api_key(Sys.getenv("CENSUS_API_KEY"))
fredr_set_key(Sys.getenv("FRED_API_KEY"))
# --- Helper: pull a FRED series into a tidy tibble ---
#
# Retrieves a single FRED series starting at `start_date` and returns a
# two-column tibble: `date` plus one value column named after the series id
# (so downstream joins across series are unambiguous).
#
# series_id  : single character string, e.g. "MORTGAGE30US".
# start_date : Date scalar; first observation to request (default 2000-01-01).
get_fred_series <- function(series_id, start_date = as.Date("2000-01-01")) {
  # Validate early so a bad argument fails here, not inside the API call.
  stopifnot(
    is.character(series_id), length(series_id) == 1, nzchar(series_id),
    inherits(start_date, "Date"), length(start_date) == 1
  )
  fredr::fredr(
    series_id = series_id,
    observation_start = start_date
  ) %>%
    select(date, value) %>%
    # Rename the generic `value` column to the series id itself.
    rename(!!series_id := value)
}
# --- Choose series to ingest ---
# Series commonly used when studying housing affordability:
#   MORTGAGE30US - 30-year fixed mortgage rate (weekly)
#   CSUSHPISA    - Case-Shiller US home price index (monthly)
#   CPIAUCSL     - Consumer Price Index, all urban consumers (monthly)
series_ids <- c("MORTGAGE30US", "CSUSHPISA", "CPIAUCSL")

# Common start of the observation window for every series.
start_date <- as.Date("2000-01-01")
# Sanity-check a freshly ingested FRED tibble and summarise it.
#
# Expects `df` to have a `date` column plus exactly one value column
# (as produced by get_fred_series). Errors if the shape is wrong;
# otherwise returns a one-row tibble describing the series: row count,
# date range, missing-value count, and duplicate-date count.
verify_fred_df <- function(df) {
  # Basic ingestion checks: shape, non-emptiness, and proper Date column.
  stopifnot(
    "date" %in% names(df),
    nrow(df) > 0,
    is.Date(df$date)
  )

  value_col <- setdiff(names(df), "date")
  stopifnot(length(value_col) == 1)

  values <- df[[value_col]]
  dates <- df$date

  tibble(
    series_id   = value_col,
    n_rows      = nrow(df),
    start_date  = min(dates, na.rm = TRUE),
    end_date    = max(dates, na.rm = TRUE),
    n_missing   = sum(is.na(values)),
    n_dup_dates = sum(duplicated(dates))
  )
}
# Ingest each FRED series, verify it, write the raw CSV, and collect a
# one-row-per-series report (with the saved file path) into a single tibble.
# Ensure the output directory exists first — write_csv does not create it,
# and the ACS chunk later in this notebook already guards its own save this way.
dir.create("./data/raw", recursive = TRUE, showWarnings = FALSE)

ingestion_report <- map_dfr(series_ids, function(sid) {
  df <- get_fred_series(sid, start_date = start_date)

  # Verification summary (row count, date range, missingness, duplicates).
  report <- verify_fred_df(df)

  # Quick peek (optional): banner now uses matching "---" delimiters.
  message("\n--- ", sid, " ---")
  print(head(df, 3))
  print(tail(df, 3))
  print(report)

  out_path <- file.path(
    "./data/raw",
    paste0("fred_", sid, "_", format(start_date, "%Y%m%d"), ".csv")
  )
  write_csv(df, out_path)

  report %>% mutate(saved_to = out_path)
})
# A tibble: 3 × 2
date MORTGAGE30US
<date> <dbl>
1 2000-01-07 8.15
2 2000-01-14 8.18
3 2000-01-21 8.26
# A tibble: 3 × 2
date MORTGAGE30US
<date> <dbl>
1 2025-12-31 6.15
2 2026-01-08 6.16
3 2026-01-15 6.06
# A tibble: 1 × 6
series_id n_rows start_date end_date n_missing n_dup_dates
<chr> <int> <date> <date> <int> <int>
1 MORTGAGE30US 1359 2000-01-07 2026-01-15 0 0
# A tibble: 3 × 2
date CSUSHPISA
<date> <dbl>
1 2000-01-01 101.
2 2000-02-01 101.
3 2000-03-01 102.
# A tibble: 3 × 2
date CSUSHPISA
<date> <dbl>
1 2025-08-01 327.
2 2025-09-01 328.
3 2025-10-01 329.
# A tibble: 1 × 6
series_id n_rows start_date end_date n_missing n_dup_dates
<chr> <int> <date> <date> <int> <int>
1 CSUSHPISA 310 2000-01-01 2025-10-01 0 0
# A tibble: 3 × 2
date CPIAUCSL
<date> <dbl>
1 2000-01-01 169.
2 2000-02-01 170
3 2000-03-01 171
# A tibble: 3 × 2
date CPIAUCSL
<date> <dbl>
1 2025-10-01 NA
2 2025-11-01 325.
3 2025-12-01 326.
# A tibble: 1 × 6
series_id n_rows start_date end_date n_missing n_dup_dates
<chr> <int> <date> <date> <int> <int>
1 CPIAUCSL 312 2000-01-01 2025-12-01 1 0
ingestion_report
NA
# --- Helper: ACS median household income (B19013) by Metro (CBSA) ---
#
# Pulls ACS 5-year table B19013 (median household income) for every
# metropolitan/micropolitan statistical area in a single year and
# standardizes the column names used by the rest of the pipeline.
get_acs_median_hh_income_metro <- function(year = 2023) {
  raw <- get_acs(
    geography = "metropolitan statistical area/micropolitan statistical area",
    table = "B19013",
    year = year,
    survey = "acs5",
    cache_table = TRUE  # cache the table shell locally to avoid repeat lookups
  )

  # Keep only the standardized columns downstream code expects.
  raw %>%
    transmute(
      year = year,
      geo_id = GEOID,
      region = NAME,
      income_median_hh = estimate,
      moe = moe
    )
}
# --- Pull data ---
YEAR <- 2023
income_metro <- get_acs_median_hh_income_metro(year = YEAR)

# --- Verification ---
# Fail fast if the pull returned nothing or lost an expected column.
required_cols <- c("geo_id", "region", "income_median_hh")
stopifnot(
  nrow(income_metro) > 0,
  all(required_cols %in% names(income_metro))
)

verification <- income_metro %>%
  summarise(
    year = first(year),
    n_rows = n(),
    n_missing_income = sum(is.na(income_median_hh)),
    min_income = min(income_median_hh, na.rm = TRUE),
    max_income = max(income_median_hh, na.rm = TRUE)
  )

print(head(income_metro, 5))
# A tibble: 5 × 5
year geo_id region income_median_hh moe
<dbl> <chr> <chr> <dbl> <dbl>
1 2023 10100 Aberdeen, SD Micro Area 70761 3303
2 2023 10140 Aberdeen, WA Micro Area 63539 2689
3 2023 10180 Abilene, TX Metro Area 66464 2039
4 2023 10220 Ada, OK Micro Area 62564 4420
5 2023 10300 Adrian, MI Micro Area 67013 1691
print(verification)
# A tibble: 1 × 5
year n_rows n_missing_income min_income max_income
<dbl> <int> <int> <dbl> <dbl>
1 2023 935 0 17624 157444
# --- Save to raw ---
# Guard the directory (write_csv does not create it), matching the
# multi-year save step later in the notebook.
dir.create("./data/raw", recursive = TRUE, showWarnings = FALSE)
# Filename prefix fixed: "acs5" (was misspelled "ac5"), consistent with the
# multi-year output file naming.
out_path <- file.path(
  "./data/raw",
  paste0("acs5_median_hh_income_metro_", YEAR, ".csv")
)
write_csv(income_metro, out_path)
message("Saved: ", out_path)
library(tidyverse)
library(tidycensus)
library(readr)
# --- Helper: ACS median household income (B19013) by Metro (CBSA), multi-year ---
#
# Pulls ACS 5-year table B19013 for each year in `years` and stacks the
# results into one long panel with standardized column names.
get_acs_median_hh_income_metro <- function(years = 2009:2023) {
  stopifnot(length(years) > 0)

  # Fetch and standardize a single year; mapped over `years` below.
  pull_one_year <- function(y) {
    get_acs(
      geography = "metropolitan statistical area/micropolitan statistical area",
      table = "B19013",
      year = y,
      survey = "acs5",
      cache_table = TRUE
    ) %>%
      transmute(
        year = y,
        geo_id = GEOID,
        region = NAME,
        income_median_hh = estimate,
        moe = moe
      )
  }

  purrr::map_dfr(years, pull_one_year)
}
# --- Pull data ---
YEARS <- 2009:2023
income_metro <- get_acs_median_hh_income_metro(years = YEARS)

# --- Verification ---
# Fail fast on an empty pull or a missing expected column.
expected_cols <- c("year", "geo_id", "region", "income_median_hh")
stopifnot(
  nrow(income_metro) > 0,
  all(expected_cols %in% names(income_metro))
)

# One-row summary of panel coverage and income range.
verification <- income_metro %>%
  summarise(
    year_min = min(year),
    year_max = max(year),
    n_rows = n(),
    n_regions = n_distinct(region),
    n_missing_income = sum(is.na(income_median_hh)),
    min_income = min(income_median_hh, na.rm = TRUE),
    max_income = max(income_median_hh, na.rm = TRUE)
  )

print(head(income_metro, 5))
print(head(income_metro, 5))
# A tibble: 5 × 5
year geo_id region income_median_hh moe
<int> <chr> <chr> <dbl> <dbl>
1 2009 10020 Abbeville, LA Micro Area 40785 1650
2 2009 10100 Aberdeen, SD Micro Area 45108 1529
3 2009 10140 Aberdeen, WA Micro Area 41618 1674
4 2009 10180 Abilene, TX Metro Area 40715 1343
5 2009 10220 Ada, OK Micro Area 36446 1565
print(verification)
# A tibble: 1 × 7
year_min year_max n_rows n_regions n_missing_income min_income max_income
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2009 2023 14130 1262 1 10499 157444
# Optional: verify coverage by year — number of distinct regions and the
# share of missing income values per ACS year.
coverage_by_year <- income_metro %>%
  mutate(income_is_na = is.na(income_median_hh)) %>%
  group_by(year) %>%
  summarise(
    n_regions = n_distinct(region),
    pct_missing_income = mean(income_is_na),
    .groups = "drop"
  )
print(head(coverage_by_year, 10))
# A tibble: 10 × 3
year n_regions pct_missing_income
<int> <int> <dbl>
1 2009 953 0
2 2010 955 0
3 2011 955 0
4 2012 955 0
5 2013 929 0
6 2014 929 0
7 2015 929 0
8 2016 945 0
9 2017 945 0
10 2018 945 0.00106
# --- Save to raw ---
# Create the output directory if needed, then write the multi-year panel
# with a filename that encodes the year range.
dir.create("data/raw", recursive = TRUE, showWarnings = FALSE)
out_file <- paste0(
  "acs5_median_hh_income_metro_", min(YEARS), "_", max(YEARS), ".csv"
)
out_path <- file.path("data/raw", out_file)
write_csv(income_metro, out_path)
message("Saved: ", out_path)