Children in low income families - exploration

Author

Jolyon Miles-Wilson

Published

June 9, 2025

Code
import pandas as pd
import requests
import os
import janitor
import numpy as np
import geopandas as gpd
import matplotlib as mpl
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import matplotlib.pyplot as plt
import seaborn as sns
import math
Code
# Cache the two ward-level sheets as pickles, which are quicker to load than
# the source .ods workbook.
data_path = os.path.join('..', 'data')
pickle_path = os.path.join(data_path, 'relative.pkl')
absolute_pickle_path = os.path.join(data_path, 'absolute.pkl')

# Both pickles are read below, so both must exist before the cache is trusted;
# checking relative.pkl alone fails when absolute.pkl is missing.
if os.path.isfile(pickle_path) and os.path.isfile(absolute_pickle_path):
    relative = pd.read_pickle(pickle_path)
    absolute = pd.read_pickle(absolute_pickle_path)
else:
    url = 'https://assets.publishing.service.gov.uk/media/67dc2c58c5528de3aa6711f9/children-in-low-income-families-local-area-statistics-2014-to-2024.ods'

    # The workbook is stored under the last component of the URL
    filename = url.split("/")[-1]

    path = os.path.join(data_path, filename)
    if os.path.isfile(path):
        print('Data already acquired. Loading it')
    else:
        # Make sure the target folder exists before writing into it
        os.makedirs(data_path, exist_ok=True)
        req = requests.get(url, timeout=120)
        # Fail here rather than saving an HTTP error page as the workbook
        req.raise_for_status()
        with open(path, 'wb') as output_file:
            output_file.write(req.content)

    relative = pd.read_excel(path, sheet_name='7_Relative_Ward', skiprows=9)
    absolute = pd.read_excel(path, sheet_name='8_Absolute_Ward', skiprows=9)
    relative.to_pickle(pickle_path)
    absolute.to_pickle(absolute_pickle_path)
Code
# Clean data


def _clean_ward_table(df):
    """Return a tidied copy of one ward-level sheet.

    Drops the first row (the UK row), normalises column names with
    ``janitor.clean_names``, strips footnote markers from them, removes the
    'number' columns, trims the leading 'percentage...' prefix so the value
    columns start at 'fye', and converts those value columns to numeric.
    """
    # Drop the UK row
    df = df.iloc[1:]
    df = janitor.clean_names(df)

    # Remove notes in colnames
    df.columns = (
        df.columns
        .str.replace(r'_\[.*$', '', regex=True)  # Remove _[ to end
        .str.replace(r'\[p\]', '', regex=True)   # Remove [p]
        .str.replace(r'_$', '', regex=True)      # Remove trailing underscore
        .str.replace(r'_%$', '', regex=True)     # Remove percentage sign at end
    )

    # Drop the number columns
    df = df.drop(columns=[col for col in df.columns if 'number' in col])

    # Rename columns to just year:
    # remove 'percentage' up to but not including 'fye'
    df.columns = df.columns.str.replace(r'^percentage.*?(?=fye)', '', regex=True)

    # Drop text in numeric columns. Cells holding the marker '[x]' (and any
    # other non-numeric text) become NaN and the column gets a numeric dtype.
    # This is done explicitly because DataFrame.replace only produced numeric
    # columns through silent downcasting, which pandas has deprecated.
    for col in [c for c in df.columns if c.startswith('fye_')]:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    return df


def _reversed_deciles(values):
    """Return national deciles of *values*, numbered so that 1 is the highest.

    NOTE(review): with duplicates='drop', pd.qcut can return fewer than ten
    bins when bin edges coincide; the '11 - x' reversal would then no longer
    start at 1. Confirm this does not occur for these columns.
    """
    deciles = pd.qcut(values, q=10, labels=False, duplicates='drop') + 1  # Deciles 1 to 10
    return 11 - deciles  # reverse so 1 is worst


relative = _clean_ward_table(relative)
absolute = _clean_ward_table(absolute)

# Loop through columns and create decile columns
columns = [col for col in relative.columns if col.startswith('fye_')]

for col in columns:
    decile_col = col + '_decile'
    relative[decile_col] = _reversed_deciles(relative[col])
    absolute[decile_col] = _reversed_deciles(absolute[col])

# Subset to haringey
haringey_relative = relative.loc[relative['local_authority'] == 'Haringey']
haringey_absolute = absolute.loc[absolute['local_authority'] == 'Haringey']

# Get some averages from national picture: mean 2015 value of the wards in
# the first and last 2015 deciles (drawn as reference lines in the line plots)
relative_decile_1_2015_mean = relative.loc[relative['fye_2015_decile'] == 1, 'fye_2015'].mean()
relative_decile_10_2015_mean = relative.loc[relative['fye_2015_decile'] == 10, 'fye_2015'].mean()

absolute_decile_1_2015_mean = absolute.loc[absolute['fye_2015_decile'] == 1, 'fye_2015'].mean()
absolute_decile_10_2015_mean = absolute.loc[absolute['fye_2015_decile'] == 10, 'fye_2015'].mean()
C:\Users\Jolyon\AppData\Local\Temp\ipykernel_46168\4279449845.py:44: FutureWarning:

Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

C:\Users\Jolyon\AppData\Local\Temp\ipykernel_46168\4279449845.py:45: FutureWarning:

Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
Code
# Get a list of wards to request from geoportal API
haringey_wards = haringey_relative['area_code_1'].unique().tolist()

# Feature service holding the December 2022 UK ward boundaries
base_url = 'https://services1.arcgis.com/ESMARspQHYMw9BZ9/ArcGIS/rest/services/Wards_December_2022_Boundaries_UK_BFC/FeatureServer/0/query'

# Build "WD22CD IN ('code1', 'code2', ...)" from the ward codes
where_clause = "WD22CD IN ('" + "', '".join(haringey_wards) + "')"

# Parameters
params = {
    "where": where_clause,  # Retrieve only the Haringey wards
    "outFields": "*",  # "*" specifies all fields
    "outSR": "4326",
    "f": "geojson",
    "resultOffset": 0,  # For pagination
    "resultRecordCount": 2000
}

# Send request; the timeout stops a stalled connection hanging the notebook
response = requests.get(base_url, params=params, timeout=60)
# Stop on an HTTP error instead of trying to parse an error page as GeoJSON
response.raise_for_status()

# Read the response content as geopandas df
haringey_boundaries = gpd.read_file(response.content)
haringey_boundaries = janitor.clean_names(haringey_boundaries)

# Merge the data with the geometry. how='right' keeps every ward in the
# statistics, so a ward without a matching boundary ends up with no geometry.
haringey_relative_gpd = haringey_boundaries.merge(haringey_relative, how='right', left_on='wd22cd', right_on='area_code_1')
haringey_absolute_gpd = haringey_boundaries.merge(haringey_absolute, how='right', left_on='wd22cd', right_on='area_code_1')

Plots

Relative

2024

Code
# Choropleth of the 2024 relative-poverty decile for each Haringey ward.
# The reversed viridis map is also reused by the decile grid that follows.
cmap = mpl.colormaps['viridis'].reversed()
fig, ax = plt.subplots(figsize=[10,10])

haringey_relative_gpd.plot(
    ax=ax,
    column='fye_2024_decile',
    legend=True,
    cmap=cmap,
    legend_kwds={'label': "Decile (where 1 represents the 10% of places nationally with the highest number)"},
    vmin=1, vmax=10,  # fix the colour scale to the full decile range
    edgecolor='black',
    linewidth=0.1
)

# Add ward labels
for _, row in haringey_relative_gpd.iterrows():
    geom = row['geometry']
    # Rows without a matched boundary have no geometry (the merge that built
    # this frame uses how='right'); skip them instead of failing on .geom_type.
    if getattr(geom, 'geom_type', None) not in ('Polygon', 'MultiPolygon'):
        continue

    # Use the centroid of the polygon for label position
    centroid = geom.centroid
    x, y = centroid.x, centroid.y
    if row['wd22nm'] == 'Bruce Castle':
        # Nudge this one label upwards
        y += 0.001

    ax.text(
        x, y,
        row['wd22nm'],  # ward name
        fontsize=8,
        ha='center',
        va='center',
        color='black',
        weight='bold'
    )

ax.set_title('Children living in relative poverty in Haringey: 2024')
ax.set_axis_off()

Across years

Code
# One small decile map per year, laid out on a three-column grid.
years = list(range(2015, 2025))
columns = [f'fye_{year}_decile' for year in years]

# Enough rows to hold every year at three maps per row
n_cols = 3
n_rows = math.ceil(len(columns) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(3 * n_cols, 3 * n_rows))

# Work with a flat sequence of axes
axes = axes.flatten()

decile_legend = {'label': "Decile (1 = highest 10% nationally)", 'shrink': 0.5}

for year, decile_column, panel in zip(years, columns, axes):
    haringey_relative_gpd.plot(
        ax=panel,
        column=decile_column,
        cmap=cmap,
        legend=True,
        legend_kwds=decile_legend,
        vmin=1, vmax=10,
        edgecolor='black',
        linewidth=0.1
    )
    panel.set_title(str(year))
    panel.set_axis_off()

# Remove the grid cells left over after the last year
for spare in axes[len(columns):]:
    fig.delaxes(spare)

fig.suptitle('Children in relative poverty by year: Deciles', size = 20)

plt.tight_layout()
plt.show()

Percentage

Code
# One small map per year of the raw values, all sharing a single colour scale.
cmap = mpl.colormaps['viridis']
years = list(range(2015, 2025))
columns = [f'fye_{year}' for year in years]

# Shared colour limits: the overall min and max across all years, widened
# outwards to the nearest multiple of .05
yearly_values = haringey_relative_gpd[columns]
highest = yearly_values.max().max()
lowest = yearly_values.min().min()
max_number = math.ceil(highest/.05) * .05
min_number = math.floor(lowest/.05) * .05

# Enough rows to hold every year at three maps per row
n_cols = 3
n_rows = math.ceil(len(columns) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(3 * n_cols, 3 * n_rows))

# Work with a flat sequence of axes
axes = axes.flatten()

percentage_legend = {'label': "Percentage of children", 'shrink': 0.5}

for year, value_column, panel in zip(years, columns, axes):
    haringey_relative_gpd.plot(
        ax=panel,
        column=value_column,
        cmap=cmap,
        legend=True,
        legend_kwds=percentage_legend,
        vmin=min_number,
        vmax=max_number
    )
    panel.set_title(str(year))
    panel.set_axis_off()

# Remove the grid cells left over after the last year
for spare in axes[len(columns):]:
    fig.delaxes(spare)

fig.suptitle('Children in relative poverty by year: Percentage', size = 20)

plt.tight_layout()
plt.show()

Absolute

2024

Code
# Choropleth of the 2024 absolute-poverty decile for each Haringey ward.
# The reversed viridis map is also reused by the decile grid that follows.
cmap = mpl.colormaps['viridis'].reversed()
fig, ax = plt.subplots(figsize=[10,10])

haringey_absolute_gpd.plot(
    ax=ax,
    column='fye_2024_decile',
    legend=True,
    cmap=cmap,
    legend_kwds={'label': "Decile (where 1 represents the 10% of places nationally with the highest number)"},
    vmin=1, vmax=10,  # fix the colour scale to the full decile range
    edgecolor='black',
    linewidth=0.1
)

# Add ward labels
for _, row in haringey_absolute_gpd.iterrows():
    geom = row['geometry']
    # Rows without a matched boundary have no geometry (the merge that built
    # this frame uses how='right'); skip them instead of failing on .geom_type.
    if getattr(geom, 'geom_type', None) not in ('Polygon', 'MultiPolygon'):
        continue

    # Use the centroid of the polygon for label position
    centroid = geom.centroid
    x, y = centroid.x, centroid.y
    if row['wd22nm'] == 'Bruce Castle':
        # Nudge this one label upwards
        y += 0.001

    ax.text(
        x, y,
        row['wd22nm'],  # ward name
        fontsize=8,
        ha='center',
        va='center',
        color='black',
        weight='bold'
    )

ax.set_title('Children living in absolute poverty in Haringey: 2024')
ax.set_axis_off()

Across years

Code
# One small decile map per year, laid out on a three-column grid.
years = list(range(2015, 2025))
columns = [f'fye_{year}_decile' for year in years]

# Enough rows to hold every year at three maps per row
n_cols = 3
n_rows = math.ceil(len(columns) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(3 * n_cols, 3 * n_rows))

# Work with a flat sequence of axes
axes = axes.flatten()

decile_legend = {'label': "Decile (1 = highest 10% nationally)", 'shrink': 0.5}

for year, decile_column, panel in zip(years, columns, axes):
    haringey_absolute_gpd.plot(
        ax=panel,
        column=decile_column,
        cmap=cmap,
        legend=True,
        legend_kwds=decile_legend,
        vmin=1, vmax=10,
        edgecolor='black',
        linewidth=0.1
    )
    panel.set_title(str(year))
    panel.set_axis_off()

# Remove the grid cells left over after the last year
for spare in axes[len(columns):]:
    fig.delaxes(spare)

fig.suptitle('Children in absolute poverty by year: Deciles', size = 20)

plt.tight_layout()
plt.show()

Percentage

Code
# One small map per year of the raw values, all sharing a single colour scale.
cmap = mpl.colormaps['viridis']
years = list(range(2015, 2025))
columns = [f'fye_{year}' for year in years]

# Shared colour limits: the overall min and max across all years, widened
# outwards to the nearest multiple of .05
yearly_values = haringey_absolute_gpd[columns]
highest = yearly_values.max().max()
lowest = yearly_values.min().min()
max_number = math.ceil(highest/.05) * .05
min_number = math.floor(lowest/.05) * .05

# Enough rows to hold every year at three maps per row
n_cols = 3
n_rows = math.ceil(len(columns) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(3 * n_cols, 3 * n_rows))

# Work with a flat sequence of axes
axes = axes.flatten()

percentage_legend = {'label': "Percentage of children", 'shrink': 0.5}

for year, value_column, panel in zip(years, columns, axes):
    haringey_absolute_gpd.plot(
        ax=panel,
        column=value_column,
        cmap=cmap,
        legend=True,
        legend_kwds=percentage_legend,
        vmin=min_number,
        vmax=max_number
    )
    panel.set_title(str(year))
    panel.set_axis_off()

# Remove the grid cells left over after the last year
for spare in axes[len(columns):]:
    fig.delaxes(spare)

fig.suptitle('Children in absolute poverty by year: Percentage', size = 20)

plt.tight_layout()
plt.show()

Line plots

Relative poverty

Code
# Line plot of every Haringey ward over time (relative poverty), with
# Northumberland Park highlighted and each line labelled at its last point.
columns = [col for col in relative.columns if col.startswith('fye_') and not col.endswith('decile')]
df_sub = haringey_relative[['ward'] + columns]
df_long = df_sub.melt(id_vars='ward', var_name='year', value_name='perc')
# Pull the digits out of e.g. 'fye_2015'; the raw string keeps the backslash
# in \d for the regex engine instead of Python treating it as an escape
df_long['year'] = df_long['year'].str.extract(r'(\d+)', expand=False).astype(int)

# Create a flag for Northumberland Park.
df_long['np'] = (df_long['ward'] == "Northumberland Park").astype(int)

# Create custom color palette so that NP is black and the rest are grey
unique_wards = df_long['ward'].unique()
colors = ['black' if ward == "Northumberland Park" else 'lightgrey' for ward in unique_wards]
color_palette = dict(zip(unique_wards, colors))

# Create the plot
plt.figure(figsize=(8, 6))
plot = sns.lineplot(data=df_long, x='year', y='perc', hue='ward', marker='o', legend=False,palette=color_palette)

# Collect final points and colors. The colour comes straight from the palette
# passed to seaborn, so it does not depend on the order in which the lines
# were added to the axes.
endpoints = []
for ward, group in df_long.groupby('ward'):
    group_sorted = group.sort_values('year')
    x = group_sorted['year'].values[-1]
    y = group_sorted['perc'].values[-1]
    endpoints.append({'ward': ward, 'x': x, 'y': y, 'color': color_palette[ward]})

# Sort to prevent overlap: walking upwards, push each label at least
# min_spacing above the previous one
endpoints = sorted(endpoints, key=lambda d: d['y'])
min_spacing = .005
adjusted_ys = []

for i, point in enumerate(endpoints):
    y = point['y']
    if i > 0:
        prev_y = adjusted_ys[-1]
        if y - prev_y < min_spacing:
            y = prev_y + min_spacing
    adjusted_ys.append(y)

# Plot labels and connecting lines
for point, new_y in zip(endpoints, adjusted_ys):
    label_x = point['x'] + 0.3  # Label offset to the right
    label_y = new_y

    # Draw a line from data point to label
    plt.plot([point['x'], label_x], [point['y'], label_y],
             color=point['color'], linewidth=1, linestyle='--', alpha=0.7)

    # Add the label
    plt.text(label_x, label_y, point['ward'],
             color=point['color'], va='center', fontsize=10)

# Reference lines: national 2015 means of the first and last deciles
plt.axhline(y=relative_decile_1_2015_mean, color='red', linestyle='--', linewidth=1)
plt.axhline(y=relative_decile_10_2015_mean, color='red', linestyle='--', linewidth=1)

plt.text(
    x=df_long['year'].max() + 1,  # Just beyond the last year
    y=relative_decile_1_2015_mean,
    s='Decile 1 average',
    va='center',
    ha='left',
    fontsize=9,
    color='red'
)

plt.text(
    x=df_long['year'].max() + 1,  # Just beyond the last year
    y=relative_decile_10_2015_mean,
    s='Decile 10 average',
    va='center',
    ha='left',
    fontsize=9,
    color='red'
)

plt.title('Children in relative poverty by ward over time: Haringey')
plt.ylabel('Percentage of children')
plt.xlabel('Year')
plt.grid(True)
plt.tight_layout()
# Ensure ticks for every year from min to max
years = np.arange(df_long['year'].min(), df_long['year'].max() + 1)
plt.xticks(years)
plt.show()

Absolute poverty

Code
# Line plot of every Haringey ward over time (absolute poverty), with
# Northumberland Park highlighted and each line labelled at its last point.
columns = [col for col in absolute.columns if col.startswith('fye_') and not col.endswith('decile')]
df_sub = haringey_absolute[['ward'] + columns]
df_long = df_sub.melt(id_vars='ward', var_name='year', value_name='perc')
# Pull the digits out of e.g. 'fye_2015'; the raw string keeps the backslash
# in \d for the regex engine instead of Python treating it as an escape
df_long['year'] = df_long['year'].str.extract(r'(\d+)', expand=False).astype(int)

# Create a flag for Northumberland Park.
df_long['np'] = (df_long['ward'] == "Northumberland Park").astype(int)

# Create custom color palette so that NP is black and the rest are grey
unique_wards = df_long['ward'].unique()
colors = ['black' if ward == "Northumberland Park" else 'lightgrey' for ward in unique_wards]
color_palette = dict(zip(unique_wards, colors))

# Create the plot
plt.figure(figsize=(8, 6))
plot = sns.lineplot(data=df_long, x='year', y='perc', hue='ward', marker='o', legend=False,palette=color_palette)

# Collect final points and colors. The colour comes straight from the palette
# passed to seaborn, so it does not depend on the order in which the lines
# were added to the axes.
endpoints = []
for ward, group in df_long.groupby('ward'):
    group_sorted = group.sort_values('year')
    x = group_sorted['year'].values[-1]
    y = group_sorted['perc'].values[-1]
    endpoints.append({'ward': ward, 'x': x, 'y': y, 'color': color_palette[ward]})

# Sort to prevent overlap: walking upwards, push each label at least
# min_spacing above the previous one
endpoints = sorted(endpoints, key=lambda d: d['y'])
min_spacing = .005
adjusted_ys = []

for i, point in enumerate(endpoints):
    y = point['y']
    if i > 0:
        prev_y = adjusted_ys[-1]
        if y - prev_y < min_spacing:
            y = prev_y + min_spacing
    adjusted_ys.append(y)

# Plot labels and connecting lines
for point, new_y in zip(endpoints, adjusted_ys):
    label_x = point['x'] + 0.3  # Label offset to the right
    label_y = new_y

    # Draw a line from data point to label
    plt.plot([point['x'], label_x], [point['y'], label_y],
             color=point['color'], linewidth=1, linestyle='--', alpha=0.7)

    # Add the label
    plt.text(label_x, label_y, point['ward'],
             color=point['color'], va='center', fontsize=10)

# Reference lines: national 2015 means of the first and last deciles
plt.axhline(y=absolute_decile_1_2015_mean, color='red', linestyle='--', linewidth=1)
plt.axhline(y=absolute_decile_10_2015_mean, color='red', linestyle='--', linewidth=1)

plt.text(
    x=df_long['year'].max() + .75,  # Just beyond the last year
    y=absolute_decile_1_2015_mean,
    s='Decile 1 average',
    va='center',
    ha='left',
    fontsize=9,
    color='red'
)

plt.text(
    x=df_long['year'].max() + .75,  # Just beyond the last year
    y=absolute_decile_10_2015_mean,
    s='Decile 10 average',
    va='center',
    ha='left',
    fontsize=9,
    color='red'
)

plt.title('Children in absolute poverty by ward over time: Haringey')
plt.ylabel('Percentage of children')
plt.xlabel('Year')
plt.grid(True)
plt.tight_layout()
# Ensure ticks for every year from min to max
years = np.arange(df_long['year'].min(), df_long['year'].max() + 1)
plt.xticks(years)
plt.show()