Tutorials
This page provides step-by-step tutorials for common use cases with EurostatAPI.jl.
Tutorial 1: Your First Eurostat Dataset
In this tutorial, we'll walk through the basics of fetching and exploring a Eurostat dataset.
Step 1: Setup
First, make sure you have EurostatAPI.jl installed and loaded:
using EurostatAPI
using DataFrames
using Statistics # For basic statistical functions
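If any of these packages are missing, install them first. The sketch below assumes EurostatAPI is available in a registry you have configured; if it is not registered, add it from its repository URL with Pkg.add(url=...) instead:
using Pkg
Pkg.add(["EurostatAPI", "DataFrames"])  # Statistics ships with Julia's standard library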
Step 2: Choose a Dataset
For this tutorial, we'll use the European GDP dataset (nama_10_gdp). This dataset contains national accounts data, including GDP figures, for EU countries.
# Fetch GDP data for 2022
df = fetch_dataset("nama_10_gdp", 2022)
Step 3: Explore the Data Structure
Let's examine what we received:
# Check the dimensions
println("Dataset size: $(size(df))")
println("Columns: $(names(df))")
# Look at the first few rows
first(df, 3)
Step 4: Understand the Dimensions
# Find the dimension columns (excluding metadata columns)
metadata_cols = ["dataset", "year", "value", "original_value", "fetch_date", "original_key"]  # names(df) returns strings
dimension_cols = filter(col -> !(col in metadata_cols), names(df))
println("Available dimensions: $dimension_cols")
# Explore each dimension
for col in dimension_cols
    unique_vals = unique(skipmissing(df[!, col]))
    println("$col has $(length(unique_vals)) unique values")
    if length(unique_vals) <= 10
        println(" Values: $unique_vals")
    else
        println(" Sample values: $(first(unique_vals, 5))...")
    end
end
Step 5: Filter for Meaningful Data
Most Eurostat datasets have multiple indicators. Let's filter for actual GDP values:
# Look for GDP-related indicators
if hasproperty(df, :na_item)
    gdp_indicators = unique(filter(!ismissing, df.na_item))
    println("Available GDP indicators:")
    for indicator in gdp_indicators
        # isequal handles missing values safely
        n_records = count(isequal(indicator), df.na_item)
        println(" $indicator: $n_records records")
    end
    # Filter for GDP at market prices (B1GQ)
    gdp_data = filter(row ->
        !ismissing(row.na_item) &&
        occursin("B1GQ", row.na_item) &&
        !ismissing(row.value), df)
    println("\nGDP at market prices: $(nrow(gdp_data)) records")
end
Step 6: Analyze by Country
if @isdefined(gdp_data) && hasproperty(gdp_data, :geo) && nrow(gdp_data) > 0
    # Group by country and calculate statistics
    # Note: the geo dimension typically also contains aggregates such as EU27_2020,
    # so the top rows may be EU-wide totals rather than individual countries.
    country_stats = combine(groupby(gdp_data, :geo),
        :value => mean => :avg_gdp,
        :value => length => :count)
    sort!(country_stats, :avg_gdp, rev=true)
    println("Top 10 countries by GDP:")
    first(country_stats, 10)
end
Tutorial 2: Time Series Analysis
In this tutorial, we'll collect data across multiple years and analyze trends.
Step 1: Collect Multi-Year Data
function collect_gdp_timeseries(years)
    all_data = DataFrame()
    for year in years
        try
            println("Fetching data for $year...")
            yearly_data = fetch_dataset("nama_10_gdp", year)
            # Filter for GDP at market prices.
            # Tip: if the dataset exposes a unit dimension, you will usually also want
            # to keep a single unit (e.g. current prices in million euro) so that each
            # country has exactly one value per year.
            gdp_yearly = filter(row ->
                !ismissing(row.na_item) &&
                occursin("B1GQ", row.na_item) &&
                !ismissing(row.value), yearly_data)
            append!(all_data, gdp_yearly)
            println(" Added $(nrow(gdp_yearly)) GDP records")
        catch e
            println(" Failed to get data for $year: $e")
        end
        # Be nice to the API
        sleep(1)
    end
    return all_data
end
# Collect data for recent years
recent_years = 2019:2023
gdp_timeseries = collect_gdp_timeseries(recent_years)
Step 2: Analyze Growth Trends
function calculate_growth_rates(data, country_code)
    # Filter for specific country
    country_data = filter(row ->
        !ismissing(row.geo) && row.geo == country_code, data)
    if nrow(country_data) == 0
        return nothing
    end
    # Sort by year
    sort!(country_data, :year)
    # Calculate year-over-year growth
    growth_data = DataFrame()
    for i in 2:nrow(country_data)
        prev_value = country_data[i-1, :value]
        curr_value = country_data[i, :value]
        if !ismissing(prev_value) && !ismissing(curr_value) && prev_value != 0
            growth_rate = (curr_value - prev_value) / prev_value * 100
            push!(growth_data, (
                country = country_code,
                year = country_data[i, :year],
                gdp = curr_value,
                growth_rate = growth_rate
            ))
        end
    end
    return growth_data
end
# Calculate growth for major EU countries
major_countries = ["DE", "FR", "IT", "ES", "NL"]
all_growth = DataFrame()
for country in major_countries
growth = calculate_growth_rates(gdp_timeseries, country)
if growth !== nothing
append!(all_growth, growth)
end
end
# Show average growth by country
if nrow(all_growth) > 0
    avg_growth = combine(groupby(all_growth, :country),
        :growth_rate => mean => :avg_growth,
        :growth_rate => std => :growth_volatility)
    sort!(avg_growth, :avg_growth, rev=true)
    println("Average GDP growth rates:")
    avg_growth
end
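Optionally, you can reshape the long growth table into a year-by-country view with DataFrames' unstack. A minimal sketch, assuming all_growth was populated above and has one growth rate per country and year:
if nrow(all_growth) > 0
    # Rows = years, columns = countries, cells = growth rates in %
    growth_wide = unstack(all_growth, :year, :country, :growth_rate)
    println(growth_wide)
end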
Tutorial 3: Working with Population Data
Let's explore demographic data using the population dataset.
Step 1: Fetch Population Data
# Population on 1 January by age and sex
pop_data = fetch_dataset("demo_pjan", 2023)
println("Population dataset dimensions: $(size(pop_data))")
Step 2: Explore Population Structure
# Check available dimensions
if hasproperty(pop_data, :age) && hasproperty(pop_data, :sex)
    println("Age groups available:")
    age_groups = unique(filter(!ismissing, pop_data.age))
    println(first(age_groups, 10))
    println("\nSex categories:")
    sex_categories = unique(filter(!ismissing, pop_data.sex))
    println(sex_categories)
end
Step 3: Calculate Total Population by Country
# Helper function to format large numbers (define it before it is used below)
function format_number(n)
    if n >= 1_000_000
        return "$(round(n/1_000_000, digits=1))M"
    elseif n >= 1_000
        return "$(round(n/1_000, digits=1))K"
    else
        return string(n)
    end
end

# Filter for total population (all ages, both sexes)
total_pop = filter(row ->
    !ismissing(row.age) && row.age == "TOTAL" &&
    !ismissing(row.sex) && row.sex == "T" &&
    !ismissing(row.value), pop_data)

if nrow(total_pop) > 0
    # Sort by population size
    sort!(total_pop, :value, rev=true)
    println("Top 10 countries by population:")
    for i in 1:min(10, nrow(total_pop))
        row = total_pop[i, :]
        country = row.geo
        population = round(Int, row.value)
        println("$i. $country: $(format_number(population))")
    end
end
Step 4: Age Distribution Analysis
function analyze_age_distribution(data, country_code)
    country_data = filter(row ->
        !ismissing(row.geo) && row.geo == country_code &&
        !ismissing(row.sex) && row.sex == "T" &&  # Total (both sexes)
        !ismissing(row.age) && row.age != "TOTAL" &&
        !ismissing(row.value), data)
    if nrow(country_data) == 0
        println("No age distribution data for $country_code")
        return nothing
    end
    sort!(country_data, :value, rev=true)
    println("Age distribution for $country_code:")
    for row in first(eachrow(country_data), 10)
        age_group = row.age
        population = round(Int, row.value)
        println(" $age_group: $(format_number(population))")
    end
    return country_data
end
# Analyze age distribution for Germany
if nrow(total_pop) > 0
    de_ages = analyze_age_distribution(pop_data, "DE")
end
Tutorial 4: Environmental Data Analysis
Let's explore environmental statistics, focusing on greenhouse gas emissions.
Step 1: Fetch Environmental Data
# Greenhouse gas emissions data
env_data = fetch_dataset("env_air_gge", 2021)
println("Environmental dataset dimensions: $(size(env_data))")
Step 2: Explore Emission Sources
# Check what emission sources are available
if hasproperty(env_data, :src_crf)
    emission_sources = unique(filter(!ismissing, env_data.src_crf))
    println("Available emission sources:")
    for source in first(emission_sources, 15)
        # isequal handles missing values safely
        n_records = count(isequal(source), env_data.src_crf)
        println(" $source: $n_records records")
    end
end
Step 3: Total Emissions by Country
# Filter for total greenhouse gas emissions
# Note: occursin("TOTAL", ...) may match several aggregate codes, and the dataset
# may also expose pollutant and unit dimensions; if so, restrict those to a single
# code as well so the sum below does not mix different series.
total_emissions = filter(row ->
    !ismissing(row.src_crf) &&
    occursin("TOTAL", row.src_crf) &&
    !ismissing(row.value), env_data)

if nrow(total_emissions) > 0
    # Group by country
    country_emissions = combine(groupby(total_emissions, :geo),
        :value => sum => :total_emissions)
    sort!(country_emissions, :total_emissions, rev=true)
    println("Top emitters (CO2 equivalent):")
    first(country_emissions, 10)
end
Tutorial 5: Data Quality Assessment
Learn how to assess and handle data quality issues.
Step 1: Check for Missing Data
function assess_data_quality(df, dataset_name)
    println("Data Quality Assessment for $dataset_name")
    println("=" ^ (30 + length(dataset_name)))
    # Basic statistics
    println("Total records: $(nrow(df))")
    println("Total columns: $(ncol(df))")
    # Missing values analysis
    println("\nMissing Values by Column:")
    for col in names(df)
        missing_count = count(ismissing, df[!, col])
        missing_pct = round(missing_count / nrow(df) * 100, digits=2)
        println(" $col: $missing_count ($missing_pct%)")
    end
    # Special values analysis
    if hasproperty(df, :original_value)
        special_df = filter(row -> !ismissing(row.original_value), df)
        if nrow(special_df) > 0
            println("\nSpecial Value Codes:")
            for code in unique(special_df.original_value)
                n_occurrences = count(isequal(code), special_df.original_value)
                println(" '$code': $n_occurrences occurrences")
            end
        end
    end
    # Actual data coverage
    actual_data = filter(row -> !ismissing(row.value), df)
    coverage_pct = round(nrow(actual_data) / nrow(df) * 100, digits=2)
    println("\nData Coverage: $(nrow(actual_data)) records ($coverage_pct%)")
    return actual_data
end
# Example usage
if @isdefined(gdp_data)
    clean_gdp = assess_data_quality(gdp_data, "GDP Data")
end
Step 2: Data Validation
function validate_numeric_data(df, value_col=:value)
    issues = String[]
    if hasproperty(df, value_col)
        vals = collect(skipmissing(df[!, value_col]))
        if length(vals) == 0
            push!(issues, "No numeric values found")
        else
            # Check for negative values
            if any(v -> v < 0, vals)
                negative_count = count(v -> v < 0, vals)
                push!(issues, "$negative_count negative values found")
            end
            # Check for very large values (might indicate data errors)
            if any(v -> v > 1e12, vals)
                extreme_count = count(v -> v > 1e12, vals)
                push!(issues, "$extreme_count extremely large values (>1e12)")
            end
            # Check for unrealistic precision
            if any(v -> v != round(v, digits=2), vals)
                high_precision = count(v -> v != round(v, digits=2), vals)
                if high_precision > length(vals) * 0.1  # More than 10%
                    push!(issues, "High precision values detected (might indicate raw data)")
                end
            end
        end
    else
        push!(issues, "Value column '$value_col' not found")
    end
    # Report results
    if isempty(issues)
        println("✓ Data validation passed")
        return true
    else
        println("⚠ Data validation issues:")
        for issue in issues
            println(" - $issue")
        end
        return false
    end
end
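As with the quality assessment above, you can run the check on data fetched earlier, for example on the GDP subset from Tutorial 1 (assuming gdp_data is still defined):
# Example usage
if @isdefined(gdp_data)
    validate_numeric_data(gdp_data)
end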
Tutorial 6: Creating Reusable Analysis Functions
Build functions that you can reuse across different datasets and analyses.
Step 1: Generic Data Fetcher
"""
    fetch_eurostat_robust(dataset_id, years; max_retries=3)

Robustly fetch Eurostat data for multiple years with error handling.
"""
function fetch_eurostat_robust(dataset_id, years; max_retries=3)
    results = Dict{Int, DataFrame}()
    errors = Dict{Int, Exception}()
    for year in years
        success = false
        for attempt in 1:max_retries
            try
                df = fetch_dataset(dataset_id, year)
                results[year] = df
                println("✓ $dataset_id ($year): $(nrow(df)) records")
                success = true
                break
            catch e
                if attempt < max_retries
                    println("⚠ $dataset_id ($year) attempt $attempt failed, retrying...")
                    sleep(2)
                else
                    errors[year] = e
                    println("✗ $dataset_id ($year): All attempts failed")
                end
            end
        end
        # Rate limiting between requests
        if success
            sleep(1)
        end
    end
    return results, errors
end
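A short usage sketch (the dataset id and year range are only examples):
# Example usage
results, errors = fetch_eurostat_robust("nama_10_gdp", 2020:2022)
for (year, df) in results
    println("$year: $(nrow(df)) records")
end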
Step 2: Flexible Analysis Function
"""
    analyze_by_dimension(data, group_col, value_col=:value; top_n=10, sort_desc=true)

Generic function to analyze data by any dimension.
"""
function analyze_by_dimension(data, group_col, value_col=:value;
                              top_n=10, sort_desc=true)
    # string(...) lets this work whether columns are passed as Symbols or Strings
    if !(string(group_col) in names(data)) || !(string(value_col) in names(data))
        error("Required columns not found in data")
    end
    # Filter out missing values
    clean_data = filter(row ->
        !ismissing(row[group_col]) && !ismissing(row[value_col]), data)
    if nrow(clean_data) == 0
        println("No valid data for analysis")
        return nothing
    end
    # Calculate statistics by group
    stats = combine(groupby(clean_data, group_col),
        value_col => mean => :mean_value,
        value_col => median => :median_value,
        value_col => std => :std_value,
        value_col => length => :count,
        value_col => minimum => :min_value,
        value_col => maximum => :max_value)
    # Sort results
    sort!(stats, :mean_value, rev=sort_desc)
    # Display top results
    println("Analysis by $group_col (top $top_n):")
    println("-" ^ 50)
    display_data = first(stats, min(top_n, nrow(stats)))
    for row in eachrow(display_data)
        group_val = row[group_col]
        mean_val = round(row.mean_value, digits=2)
        n = row.count
        println("$group_val: Mean=$mean_val (n=$n)")
    end
    return stats
end
# Example usage
if @isdefined(gdp_timeseries)
    country_analysis = analyze_by_dimension(gdp_timeseries, :geo)
end
Step 3: Automated Report Generator
"""
    generate_dataset_report(dataset_id, year; output_file=nothing)

Generate a comprehensive report for any Eurostat dataset.
(The `output_file` keyword is accepted but currently unused.)
"""
function generate_dataset_report(dataset_id, year; output_file=nothing)
    println("Generating report for $dataset_id ($year)")
    println("=" ^ 50)
    # Fetch data (try blocks have their own scope, so assign the result of the
    # whole try/catch expression to df)
    df = try
        fetch_dataset(dataset_id, year)
    catch e
        println("Failed to fetch data: $e")
        return nothing
    end
    # Basic information
    println("\n1. Dataset Overview")
    println(" Records: $(nrow(df))")
    println(" Columns: $(ncol(df))")
    # Data quality
    actual_data = filter(row -> !ismissing(row.value), df)
    coverage = round(nrow(actual_data) / nrow(df) * 100, digits=1)
    println(" Data coverage: $coverage%")
    # Dimensions analysis (names(df) returns strings)
    metadata_cols = ["dataset", "year", "value", "original_value", "fetch_date", "original_key"]
    dimension_cols = filter(col -> !(col in metadata_cols), names(df))
    println("\n2. Dimensions Analysis")
    for col in dimension_cols
        unique_count = length(unique(skipmissing(df[!, col])))
        println(" $col: $unique_count categories")
    end
    # Value statistics
    if nrow(actual_data) > 0
        vals = actual_data.value
        println("\n3. Value Statistics")
        println(" Min: $(minimum(vals))")
        println(" Max: $(maximum(vals))")
        println(" Mean: $(round(mean(vals), digits=2))")
        println(" Median: $(round(median(vals), digits=2))")
    end
    # Top categories for each dimension
    println("\n4. Top Categories by Dimension")
    for col in dimension_cols[1:min(3, length(dimension_cols))]  # Limit to first 3
        println(" $col:")
        analyze_by_dimension(actual_data, col, top_n=5)
    end
    return df
end
# Example usage
# report_data = generate_dataset_report("nama_10_gdp", 2022)
These tutorials provide a solid foundation for working with EurostatAPI.jl across different types of analyses and datasets. Each tutorial builds upon the previous ones, showing progressively more advanced techniques for data fetching, analysis, and quality assessment.