Examples
This page provides comprehensive examples of using EurostatAPI.jl to access and analyze Eurostat data.
Basic Usage Examples
Fetching a Simple Dataset
using EurostatAPI
using DataFrames
# Fetch European GDP data for 2022
df = fetch_dataset("nama_10_gdp", 2022)
# Examine the structure
println("Dataset dimensions: $(size(df))")
println("Column names: $(names(df))")
# Show first few rows
first(df, 5)
Exploring Dataset Structure
# Check what dimensions are available
dimension_cols = filter(name -> !(name in ["dataset", "year", "value", "original_value", "fetch_date", "original_key"]), names(df))
println("Available dimensions: ", dimension_cols)
# Count unique values in each dimension
for col in dimension_cols
unique_count = length(unique(skipmissing(df[!, col])))
println("$col: $unique_count unique values")
end
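As a cross-check, DataFrames' built-in describe produces a similar per-column summary in one call; a minimal sketch, assuming df from the fetch above:
# Unique-value counts, missing counts, and element types per column
describe(df, :nunique, :nmissing, :eltype)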
Working with Different Dataset Types
National Accounts Data
# GDP and main components
gdp_data = fetch_dataset("nama_10_gdp", 2022)
# Filter for GDP at market prices for EU countries
if "na_item" in names(gdp_data) && "geo" in names(gdp_data)
eu_gdp = filter(row ->
!ismissing(row.na_item) &&
occursin("B1GQ", row.na_item) && # GDP at market prices
!ismissing(row.geo) &&
length(row.geo) == 2, # Country codes are 2 letters
gdp_data)
println("EU GDP data: $(nrow(eu_gdp)) records")
end
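To see which records dominate the filtered set, you can sort by value. A follow-up sketch, assuming eu_gdp from above has a numeric value column (nama_10_gdp typically mixes several units, so real code would also filter on the unit dimension):
if @isdefined(eu_gdp) && nrow(eu_gdp) > 0
    # Drop rows without a numeric value, then show the five largest
    top5 = first(sort(dropmissing(eu_gdp, :value), :value, rev=true), 5)
    println(select(top5, :geo, :value))
end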
Population Data
# Population on 1 January by age and sex
pop_data = fetch_dataset("demo_pjan", 2023)
# Examine population structure
if "age" in names(pop_data) && "sex" in names(pop_data)
# Total population by country
total_pop = filter(row ->
!ismissing(row.age) && row.age == "TOTAL" &&
!ismissing(row.sex) && row.sex == "T", # Total (both sexes)
pop_data)
println("Total population records: $(nrow(total_pop))")
end
Environmental Data
# Greenhouse gas emissions
emission_data = fetch_dataset("env_air_gge", 2021)
# Air quality data
if haskey(ENV, "FETCH_AIR_QUALITY") # Optional example
air_data = fetch_dataset("env_air_qa", 2022)
end
Time Series Analysis
Multi-Year Data Collection
function collect_time_series(dataset_id, years)
all_data = DataFrame()
successful_years = Int[]
for year in years
try
yearly_data = fetch_dataset(dataset_id, year)
if nrow(yearly_data) > 0
append!(all_data, yearly_data; cols=:union) # tolerate minor schema differences across years
push!(successful_years, year)
println("✓ Year $year: $(nrow(yearly_data)) records")
else
println("⚠ Year $year: No data available")
end
catch e
println("✗ Year $year: Error - $e")
end
# Be respectful to the API
sleep(1)
end
println("\nSummary:")
println("Successfully collected: $(length(successful_years)) years")
println("Total records: $(nrow(all_data))")
println("Years: $(successful_years)")
return all_data, successful_years
end
# Collect GDP data for recent years
gdp_timeseries, years_collected = collect_time_series("nama_10_gdp", 2019:2023)
Analyzing Trends
using Statistics, StatsBase
function analyze_country_trends(data, country_code, indicator)
# Filter for specific country and indicator
country_data = filter(row ->
!ismissing(row.geo) && row.geo == country_code &&
!ismissing(row.na_item) && occursin(indicator, row.na_item) &&
!ismissing(row.value),
data)
if nrow(country_data) == 0
println("No data found for $country_code with indicator $indicator")
return nothing
end
# Sort by year
sort!(country_data, :year)
# Calculate year-over-year growth
if nrow(country_data) > 1
values = country_data.value
years = country_data.year
growth_rates = Float64[]
for i in 2:length(values)
if values[i-1] != 0
growth = (values[i] - values[i-1]) / values[i-1] * 100
push!(growth_rates, growth)
end
end
println("$country_code GDP trends:")
println("Years: $(minimum(years))-$(maximum(years))")
println("Average growth: $(round(mean(growth_rates), digits=2))%")
println("Growth volatility (std): $(round(std(growth_rates), digits=2))%")
return country_data, growth_rates
end
return country_data, nothing
end
# Example analysis for Germany
if nrow(gdp_timeseries) > 0
de_analysis = analyze_country_trends(gdp_timeseries, "DE", "B1GQ")
end
Data Quality and Validation
Checking for Missing Data
function data_quality_report(df)
println("Data Quality Report")
println("==================")
# Overall statistics
println("Total records: $(nrow(df))")
println("Total columns: $(ncol(df))")
# Missing value analysis
println("\nMissing Value Analysis:")
for col in names(df)
missing_count = sum(ismissing.(df[!, col]))
missing_pct = round(missing_count / nrow(df) * 100, digits=2)
println("$col: $missing_count missing ($(missing_pct)%)")
end
# Special value analysis
if "original_value" in names(df)
special_values = filter(row -> !ismissing(row.original_value), df)
if nrow(special_values) > 0
println("\nSpecial Values:")
value_counts = countmap(special_values.original_value)
for (val, count) in sort(collect(value_counts), by=x->x[2], rev=true)
println("'$val': $count occurrences")
end
end
end
# Actual data records
actual_data = filter(row -> !ismissing(row.value), df)
println("\nActual data records: $(nrow(actual_data)) ($(round(nrow(actual_data)/nrow(df)*100, digits=2))%)")
return actual_data
end
# Run quality report
if @isdefined(gdp_data)
clean_data = data_quality_report(gdp_data)
end
Data Validation
function validate_eurostat_data(df)
issues = String[]
# Check required columns
required_cols = ["dataset", "year", "value", "fetch_date"]
for col in required_cols
if !(col in names(df))
push!(issues, "Missing required column: $col")
end
end
# Check data types
if "year" in names(df) && !all(x -> ismissing(x) || isa(x, Integer), df.year)
push!(issues, "Year column contains non-integer values")
end
# Check for completely empty dataset
if nrow(df) == 0
push!(issues, "Dataset is empty")
end
# Check for reasonable value ranges
if "value" in names(df)
numeric_values = filter(!ismissing, df.value)
if length(numeric_values) > 0
if any(x -> isa(x, Number) && (x < -1e12 || x > 1e12), numeric_values)
push!(issues, "Some values are outside reasonable range")
end
end
end
# Report results
if isempty(issues)
println("✓ Data validation passed")
return true
else
println("⚠ Data validation issues found:")
for issue in issues
println(" - $issue")
end
return false
end
end
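A quick usage sketch for the validator, assuming gdp_data from the earlier example is still in scope:
if @isdefined(gdp_data)
    is_valid = validate_eurostat_data(gdp_data)
    println("Proceed with analysis: $is_valid")
end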
Advanced Analysis Examples
Cross-Country Comparisons
function compare_countries(data, indicator, countries, year_range=nothing)
# Filter for specific indicator
indicator_data = filter(row ->
!ismissing(row.na_item) && occursin(indicator, row.na_item) &&
!ismissing(row.geo) && row.geo in countries &&
!ismissing(row.value),
data)
if nrow(indicator_data) == 0
println("No data found for comparison")
return nothing
end
# Apply year filter if specified
if year_range !== nothing
indicator_data = filter(row -> row.year in year_range, indicator_data)
end
# Calculate statistics by country
country_stats = combine(groupby(indicator_data, :geo),
:value => mean => :avg_value,
:value => std => :std_value,
:value => length => :count,
:year => (x -> (minimum(x), maximum(x))) => :year_range)
sort!(country_stats, :avg_value, rev=true)
println("Country Comparison - $indicator")
println("="^50)
for row in eachrow(country_stats)
println("$(row.geo): Avg = $(round(row.avg_value, digits=2)), " *
"Std = $(round(row.std_value, digits=2)), " *
"Years: $(row.year_range[1])-$(row.year_range[2]) ($(row.count) obs)")
end
return country_stats
end
# Compare GDP across major EU economies
if @isdefined(gdp_timeseries)
major_eu = ["DE", "FR", "IT", "ES", "NL", "PL"]
comparison = compare_countries(gdp_timeseries, "B1GQ", major_eu)
end
Regional Analysis
function analyze_by_region(data)
# Define regional groupings (simplified)
regions = Dict(
"Western Europe" => ["DE", "FR", "IT", "ES", "NL", "BE", "AT", "CH"],
"Northern Europe" => ["SE", "DK", "FI", "NO", "IS"],
"Eastern Europe" => ["PL", "CZ", "HU", "SK", "SI", "HR", "RO", "BG"],
"Southern Europe" => ["GR", "PT", "CY", "MT"]
)
regional_data = DataFrame()
for (region_name, countries) in regions
region_subset = filter(row ->
!ismissing(row.geo) && row.geo in countries &&
!ismissing(row.value),
data)
if nrow(region_subset) > 0
# Add region column
region_subset[!, :region] .= region_name # broadcast the scalar across all rows
append!(regional_data, region_subset)
end
end
if nrow(regional_data) > 0
# Regional statistics
regional_stats = combine(groupby(regional_data, [:region, :year]),
:value => mean => :avg_value,
:value => sum => :total_value,
:value => length => :country_count)
return regional_stats
end
return nothing
end
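A usage sketch for the regional analysis, assuming gdp_timeseries from the time-series section:
if @isdefined(gdp_timeseries) && nrow(gdp_timeseries) > 0
    regional_stats = analyze_by_region(gdp_timeseries)
    regional_stats !== nothing && println(first(regional_stats, 10))
end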
Error Handling and Robustness
Comprehensive Error Handling
function robust_fetch(dataset, year; max_retries=3, delay=5)
for attempt in 1:max_retries
try
println("Attempt $attempt for $dataset ($year)...")
df = fetch_dataset(dataset, year)
# Validate the result
if nrow(df) == 0
println("Warning: Empty dataset received")
return df # Return empty but valid DataFrame
end
println("Success: $(nrow(df)) records retrieved")
return df
catch e
println("Attempt $attempt failed: $e")
if attempt < max_retries
println("Waiting $(delay) seconds before retry...")
sleep(delay)
else
println("All attempts failed for $dataset ($year)")
rethrow(e)
end
end
end
end
# Usage with error handling
function safe_multi_fetch(datasets, year)
results = Dict{String, DataFrame}()
errors = Dict{String, Exception}()
for dataset in datasets
try
df = robust_fetch(dataset, year)
results[dataset] = df
catch e
errors[dataset] = e
end
end
println("\nSummary:")
println("Successful: $(length(results)) datasets")
println("Failed: $(length(errors)) datasets")
if !isempty(errors)
println("\nErrors:")
for (dataset, error) in errors
println("$dataset: $error")
end
end
return results, errors
end
# Example usage
test_datasets = ["nama_10_gdp", "demo_pjan", "env_air_gge"]
results, errors = safe_multi_fetch(test_datasets, 2022)
Monitoring and Logging
using Logging, Dates # Dates provides now() for the timestamp below
function fetch_with_detailed_logging(dataset, year)
# Create a custom logger
logger = ConsoleLogger(stderr, Logging.Info)
with_logger(logger) do
@info "Starting fetch" dataset=dataset year=year timestamp=now()
start_time = time()
try
df = fetch_dataset(dataset, year)
elapsed = time() - start_time
@info "Fetch completed successfully" rows=nrow(df) columns=ncol(df) elapsed_seconds=round(elapsed, digits=2)
# Log data quality metrics
actual_values = count(!ismissing, df.value)
@info "Data quality" total_records=nrow(df) actual_values=actual_values missing_rate=round((nrow(df)-actual_values)/nrow(df)*100, digits=2)
return df
catch e
elapsed = time() - start_time
@error "Fetch failed" exception=e elapsed_seconds=round(elapsed, digits=2)
rethrow(e)
end
end
end
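A usage sketch; the structured log records go to stderr via the ConsoleLogger:
df_logged = fetch_with_detailed_logging("nama_10_gdp", 2022)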
Integration Examples
Combining with Other Packages
# Integration with Plots.jl for visualization
using Plots
function plot_time_series(data, country, indicator)
country_data = filter(row ->
!ismissing(row.geo) && row.geo == country &&
!ismissing(row.na_item) && occursin(indicator, row.na_item) &&
!ismissing(row.value),
data)
if nrow(country_data) == 0
println("No data to plot")
return nothing
end
sort!(country_data, :year)
plot(country_data.year, country_data.value,
title="$indicator for $country",
xlabel="Year",
ylabel="Value",
marker=:circle,
linewidth=2)
end
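A usage sketch for the plotting helper, assuming gdp_timeseries is available; the output filename is just an example:
if @isdefined(gdp_timeseries) && nrow(gdp_timeseries) > 0
    plt = plot_time_series(gdp_timeseries, "DE", "B1GQ")
    plt !== nothing && savefig(plt, "de_gdp.png") # example output path
end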
# Integration with CSV.jl for data export
using CSV
function export_to_csv(data, filename)
# Clean up the data for export
export_data = select(data, Not([:fetch_date, :original_key]))
CSV.write(filename, export_data)
println("Data exported to $filename")
println("Exported $(nrow(export_data)) rows, $(ncol(export_data)) columns")
end
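A usage sketch for the export helper, assuming gdp_data is in scope; the filename is illustrative:
if @isdefined(gdp_data)
    export_to_csv(gdp_data, "gdp_2022.csv")
end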
Batch Processing Pipeline
function eurostat_pipeline(datasets, years, output_dir="eurostat_data")
# Create output directory
if !isdir(output_dir)
mkdir(output_dir)
end
results_summary = DataFrame(
dataset = String[],
year = Int[],
status = String[],
records = Int[],
filename = String[]
)
for dataset in datasets
for year in years
try
# Fetch data
df = fetch_dataset(dataset, year)
# Generate filename
filename = joinpath(output_dir, "$(dataset)_$(year).csv")
# Export to CSV
CSV.write(filename, df)
# Record success
push!(results_summary, (dataset, year, "SUCCESS", nrow(df), filename))
println("✓ $dataset ($year): $(nrow(df)) records → $filename")
catch e
# Record failure
push!(results_summary, (dataset, year, "FAILED", 0, ""))
println("✗ $dataset ($year): $e")
end
# Rate limiting
sleep(2)
end
end
# Save summary
summary_file = joinpath(output_dir, "processing_summary.csv")
CSV.write(summary_file, results_summary)
println("\nPipeline completed. Summary saved to $summary_file")
return results_summary
end
# Example pipeline execution
# datasets_to_process = ["nama_10_gdp", "demo_pjan"]
# years_to_process = 2020:2022
# summary = eurostat_pipeline(datasets_to_process, years_to_process)
Performance Optimization Examples
Memory-Efficient Processing
function process_large_dataset_efficiently(dataset, year)
println("Processing $dataset ($year) with memory monitoring...")
# Base.gc_bytes() reports cumulative bytes allocated; differences approximate allocation, not live memory use
initial_memory = Base.gc_bytes()
# Fetch data
df = fetch_dataset(dataset, year)
# Allocation counter after the fetch
post_fetch_memory = Base.gc_bytes()
# Process in chunks if large
if nrow(df) > 100_000
println("Large dataset detected. Processing in chunks...")
chunk_size = 50_000
processed_chunks = 0
for start_idx in 1:chunk_size:nrow(df)
end_idx = min(start_idx + chunk_size - 1, nrow(df))
chunk = @view df[start_idx:end_idx, :] # use a view to avoid copying each chunk
# Process chunk (example: just count non-missing values)
non_missing = count(!ismissing, chunk.value)
processed_chunks += 1
if processed_chunks % 5 == 0
println("Processed $processed_chunks chunks...")
# Force garbage collection periodically
GC.gc()
end
end
end
# Final allocation counter
final_memory = Base.gc_bytes()
println("Cumulative allocation:")
println("  At start: $(initial_memory ÷ 1024^2) MB")
println("  After fetch: $(post_fetch_memory ÷ 1024^2) MB")
println("  At end: $(final_memory ÷ 1024^2) MB")
return df
end
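A usage sketch; for most annual datasets the chunked path will not trigger:
df_large = process_large_dataset_efficiently("nama_10_gdp", 2022)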
Filtering and Chunking Large Datasets
Using Filters to Reduce Response Size
The fetch_dataset function now supports filtering to reduce data transfer and improve performance:
# Filter by geographic regions
df_germany = fetch_dataset("nama_10_gdp", 2022; geo=["DE"])
df_european = fetch_dataset("nama_10_gdp", 2022; geo=["EU27_2020", "DE", "FR", "IT", "ES"])
# Filter PRODCOM data by indicators and product codes
df_quantity = fetch_dataset("DS-056120", 2022; indicators=["PRODQNT", "QNTUNIT"])
df_products = fetch_dataset("DS-056120", 2022; prodcode=["10110000", "10120000", "10130000"])
# Combine multiple filters
df_filtered = fetch_dataset("DS-056120", 2022;
indicators=["PRODQNT"],
geo=["EU27_2020"],
prodcode=["10110000", "10120000"])
Automatic Fallback for Large Datasets
The fetch_with_fallback function automatically handles HTTP 413 (Request Entity Too Large) errors:
# Automatically adds filters if response is too large
df = fetch_with_fallback("DS-056120", 2022)
# You can still specify your own filters
df_custom = fetch_with_fallback("DS-056120", 2022; geo=["DE", "FR"])
# Handle errors gracefully (HTTP.jl provides HTTP.StatusError)
using HTTP
try
df = fetch_dataset("DS-056120", 2022)
catch e
if isa(e, HTTP.StatusError) && e.status == 413
println("Dataset too large, using fallback...")
df = fetch_with_fallback("DS-056120", 2022)
else
rethrow(e)
end
end
Chunked Fetching for Very Large Datasets
For extremely large datasets, use fetch_dataset_chunked:
# Fetch PRODCOM data in chunks of 100 product codes
df_chunked = fetch_dataset_chunked("DS-056120", 2022;
chunk_by=:prodcode,
chunk_size=100)
# Chunk by geographic regions
df_geo_chunks = fetch_dataset_chunked("nama_10_gdp", 2022;
chunk_by=:geo,
chunk_size=10)
# Combine chunking with filtering
df_filtered_chunks = fetch_dataset_chunked("DS-056120", 2022;
chunk_by=:prodcode,
chunk_size=50,
indicators=["PRODQNT"])
Getting Dataset Metadata
Query available dimensions before fetching data:
# Get metadata for a dataset
metadata = get_dataset_metadata("DS-056120")
println("Number of product codes: $(length(metadata.prodcodes))")
println("Number of geographic codes: $(length(metadata.geo_codes))")
println("Available indicators: $(metadata.indicators)")
# Use metadata to select specific codes
if !isempty(metadata.prodcodes)
# Get first 10 product codes
selected_codes = metadata.prodcodes[1:min(10, length(metadata.prodcodes))]
df = fetch_dataset("DS-056120", 2022; prodcode=selected_codes)
end
Smart Fetching Strategy
Combine metadata queries with appropriate fetch methods:
function fetch_prodcom_smart(year::Int; max_products::Int=100)
dataset = "DS-056120"
# First, get metadata to understand the dataset
metadata = get_dataset_metadata(dataset)
if isempty(metadata.prodcodes)
# No product codes found, use fallback
return fetch_with_fallback(dataset, year)
elseif length(metadata.prodcodes) > max_products
# Large dataset, use chunking
return fetch_dataset_chunked(dataset, year;
chunk_by=:prodcode,
chunk_size=max_products,
indicators=["PRODQNT", "QNTUNIT"])
else
# Small enough to fetch directly
return fetch_dataset(dataset, year;
indicators=["PRODQNT", "QNTUNIT"])
end
end
# Use the smart fetching function
df = fetch_prodcom_smart(2022)
Memory-Efficient Processing with Chunks
Process chunks as they arrive to minimize memory usage:
function process_prodcom_streaming(year::Int)
metadata = get_dataset_metadata("DS-056120")
chunk_size = 50
results = []
for i in 1:chunk_size:length(metadata.prodcodes)
chunk_codes = metadata.prodcodes[i:min(i+chunk_size-1, end)]
try
chunk_df = fetch_dataset("DS-056120", year; prodcode=chunk_codes)
# Process chunk immediately to save memory
summary = DataFrame(
prodcode = chunk_codes,
total_quantity = [sum(skipmissing(chunk_df.value[chunk_df.prodcode .== code]); init=0.0)
for code in chunk_codes]
)
push!(results, summary)
# Allow garbage collection between chunks
GC.gc()
catch e
@warn "Failed to process chunk: $e"
end
end
return isempty(results) ? DataFrame() : vcat(results...)
end
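A usage sketch for the streaming processor; expect one summary row per product code, assuming the dataset exposes a prodcode column:
quantities = process_prodcom_streaming(2022)
println("Summarized $(nrow(quantities)) product codes")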
These examples demonstrate the versatility and robustness of EurostatAPI.jl across a range of data analysis scenarios. The package handles the complexity of the Eurostat API while delivering clean, analyzable data for your research.