Analysis: Report matched

Authors

Jolyon Miles-Wilson

Celestin Okoroji

Published

April 28, 2025

Code
library(haven)
library(poLCA)
library(Hmisc)
library(dplyr)
library(ggplot2)
library(tidyr)
library(skimr)
library(kableExtra)
#library(MASS)
library(wesanderson)
library(ggrepel)
library(here)
library(emmeans)
#library(devtools)
#install_version("sjstats", version = "0.18.2")
library(sjstats)
library(readr)
library(sjPlot)
library(nnet)
library(apaTables)
Code
rm(list = ls())
options(scipen = 999)
colours <- wes_palette("GrandBudapest2",4,"discrete")
better_colours <- c('#8dd3c7','#bebada','#fb8072','#80b1d3','#fdb462')
many_colours <- c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928','#8dd3c7','#ffffb3','#bebada','#fb8072','#80b1d3','#fdb462','#b3de69','#fccde5','#d9d9d9','#bc80bd','#ccebc5','#ffed6f')
Code
extract_glm_coefs <- function(mod, only_sig=F, decimal_places = 3){
  coefs <- coef(summary(mod)) 
  if(only_sig==T){
    coefs <- coefs[which(coefs[,4] < .05),]
  }
  coefs <- as_tibble(coefs, rownames="variable") %>% # specify new variable to add rownames to 
    mutate(
    or = round(exp(Estimate), decimal_places), .after=Estimate
    )
}

extract_lm_coefs <- function(mod, only_sig = F){
  coefs <- coef(summary(mod)) 
  if(only_sig==T){
    coefs <- coefs[which(coefs[,4] < .05),]
  }
  coefs <- as_tibble(coefs, rownames="variable") # specify new variable to add rownames to 
}

get_pvalue <- function(model){
  f_value <- summary(model)$fstatistic
  p_value <- pf(f_value['value'], f_value['numdf'], f_value['dendf'], lower.tail = F)
  attributes(p_value) <- NULL
  return(p_value)
}
Code
data <- readRDS("../Data/2025-04-07 - Cleaned_data.rds")

# Specify data to be used in income analysis
income_data <- filter(data, income_drop_all==0)

1 Outsourced workers are paid £2,951 less a year

“Overall, we find that outsourced workers are paid £2,951 less a year, and £63 less a week, than non-outsourced workers.”

Code
# Annual income
# Intercept only
mod_base <- lm(income_annual_all ~ 1, income_data, weights = NatRepemployees)
# H1
mod_annual <- lm(income_annual_all ~ Age + Gender + Has_Degree + Ethnicity_collapsed + Region + outsourcing_status + BORNUK_labelled, income_data, weights = NatRepemployees)

summary(mod_annual)

# F and p
f_annual <- round(anova(mod_base, mod_annual)[2,"F"],2)
p_annual <- anova(mod_base, mod_annual)[2,"Pr(>F)"]
if(p_annual < .001){
  p_annual = "< .001"
} else{
  p_annual = paste0("= ",round(p_annual,3))
}

# Degrees of freedom
dfs_annual <- as.list(anova(mod_base, mod_annual)[2,c("Df","Res.Df")])
# R2
rsquare_annual <- round(summary(mod_annual)$r.squared,2)

# Weekly income
# Intercept only
mod_base <- lm(income_weekly_all ~ 1, income_data, weights = NatRepemployees)
# H1
mod_weekly <- lm(income_weekly_all ~ Age + Gender + Has_Degree + Ethnicity_collapsed + Region + outsourcing_status + BORNUK_labelled, income_data, weights = NatRepemployees)
summary(mod_weekly)

# F and p
f_weekly <- round(anova(mod_base, mod_weekly)[2,"F"],2)
p_weekly <- anova(mod_base, mod_weekly)[2,"Pr(>F)"]
if(p_weekly < .001){
  p_weekly = "< .001"
} else{
  p_weekly = paste0("= ",round(p_weekly,3))
}

# Degrees of freedom
dfs_weekly <- as.list(anova(mod_base, mod_weekly)[2,c("Df","Res.Df")])
# R2
rsquare_weekly <- round(summary(mod_weekly)$r.squared,2)

The annual income model was statistically significant (R2 = 0.18, F(34, 8256) = 54.66, p < .001). The table below shows the coefficients for the annual income model.

Code
labels <- c(
  'Intercept',
  'Age',
  'Gender: Male',
  'Gender: Other',
  'Gender: Prefer not to say',
  'Education: Has degree',
  "Education: Don't know",
  'Ethnicity: Arab/British Arab',
  'Ethnicity: Asian/Asian British',
  'Ethnicity: Black/African/Caribbean/Black British',
  'Ethnicity: Mixed/Multiple ethnic group',
  'Ethnicity: Other ethnic group',
  'Ethnicity: Prefer not to say',
  'Ethnicity: White other',
  'Region: East Midlands',
  'Region: East of England',
  'Region: North East',
  'Region: North West',
  'Region: Northern Ireland',
  'Region: Scotland',
  'Region: South East',
  'Region: South West',
  'Region: Wales',
  'Region: West Midlands',
  'Region: Yorkshire and the Humber',
  'Outsourcing: Outsourced',
  'Migration: Arrived within the last year',
  'Migration: Arrived within the last 3 years',
  'Migration: Arrived within the last 5 years',
  'Migration: Arrived within the last 10 years',
  'Migration: Arrived within the last 15 years',
  'Migration: Arrived within the last 20 years',
  'Migration: Arrived within the last 30 years',
  'Migration: Arrived more than 30 years ago',
  'Migration: Prefer not to say'
)
  
tab_model(mod_annual, pred.labels = labels, dv.labels = "Annual income")
Table 1: Linear regression predicting annual income
  Annual income
Predictors Estimates CI p
Intercept 23883.73 22626.81 – 25140.64 <0.001
Age 14.62 -6.00 – 35.24 0.165
Gender: Male 7027.53 6501.18 – 7553.88 <0.001
Gender: Other 944.17 -5768.31 – 7656.65 0.783
Gender: Prefer not to say 4159.04 -1081.10 – 9399.18 0.120
Education: Has degree 8194.74 7648.05 – 8741.42 <0.001
Education: Don't know -1971.76 -4097.27 – 153.76 0.069
Ethnicity: Arab/British Arab -167.76 -4857.92 – 4522.40 0.944
Ethnicity: Asian/Asian British -465.21 -1559.07 – 628.65 0.404
Ethnicity: Black/African/Caribbean/Black British -1178.97 -2782.72 – 424.79 0.150
Ethnicity: Mixed/Multiple ethnic group -1509.33 -3488.27 – 469.61 0.135
Ethnicity: Other ethnic group 3584.34 -1007.55 – 8176.22 0.126
Ethnicity: Prefer not to say -258.17 -1976.58 – 1460.23 0.768
Ethnicity: White other -626.35 -1988.83 – 736.13 0.368
Region: East Midlands -5770.04 -6985.32 – -4554.76 <0.001
Region: East of England -4073.94 -5215.88 – -2931.99 <0.001
Region: North East -4850.06 -6378.79 – -3321.33 <0.001
Region: North West -4475.61 -5581.54 – -3369.68 <0.001
Region: Northern Ireland -6545.93 -8319.18 – -4772.68 <0.001
Region: Scotland -5465.81 -6653.77 – -4277.85 <0.001
Region: South East -3406.44 -4446.13 – -2366.75 <0.001
Region: South West -5685.35 -6871.46 – -4499.25 <0.001
Region: Wales -5366.02 -6811.80 – -3920.25 <0.001
Region: West Midlands -5002.23 -6158.23 – -3846.22 <0.001
Region: Yorkshire and the Humber -5523.70 -6705.94 – -4341.47 <0.001
Outsourcing: Outsourced -2951.02 -3660.76 – -2241.28 <0.001
Migration: Arrived within the last year -6135.94 -8377.00 – -3894.88 <0.001
Migration: Arrived within the last 3 years -2391.85 -4413.30 – -370.39 0.020
Migration: Arrived within the last 5 years -2030.80 -4260.26 – 198.65 0.074
Migration: Arrived within the last 10 years -609.19 -2354.44 – 1136.07 0.494
Migration: Arrived within the last 15 years 743.99 -1195.56 – 2683.54 0.452
Migration: Arrived within the last 20 years 1334.93 -748.43 – 3418.29 0.209
Migration: Arrived within the last 30 years 3501.23 1081.94 – 5920.53 0.005
Migration: Arrived more than 30 years ago -198.94 -2116.97 – 1719.08 0.839
Migration: Prefer not to say -2092.23 -5625.02 – 1440.55 0.246
Observations 8291
R2 / R2 adjusted 0.184 / 0.180

As expected, the model statistics for weekly income model were identical to the those of the annual income model. The model was statistically significant (R2 = 0.18, F(34, 8256) = 54.66, p < .001). The table below shows the coefficients for the weekly income model.

Code
tab_model(mod_weekly, pred.labels = labels, dv.labels = "Weekly income")
Table 2: Linear regression predicting weekly income
  Weekly income
Predictors Estimates CI p
Intercept 513.16 486.15 – 540.16 <0.001
Age 0.31 -0.13 – 0.76 0.165
Gender: Male 150.99 139.68 – 162.30 <0.001
Gender: Other 20.29 -123.94 – 164.51 0.783
Gender: Prefer not to say 89.36 -23.23 – 201.95 0.120
Education: Has degree 176.07 164.32 – 187.81 <0.001
Education: Don't know -42.36 -88.03 – 3.30 0.069
Ethnicity: Arab/British Arab -3.60 -104.38 – 97.17 0.944
Ethnicity: Asian/Asian British -10.00 -33.50 – 13.51 0.404
Ethnicity: Black/African/Caribbean/Black British -25.33 -59.79 – 9.13 0.150
Ethnicity: Mixed/Multiple ethnic group -32.43 -74.95 – 10.09 0.135
Ethnicity: Other ethnic group 77.01 -21.65 – 175.67 0.126
Ethnicity: Prefer not to say -5.55 -42.47 – 31.37 0.768
Ethnicity: White other -13.46 -42.73 – 15.82 0.368
Region: East Midlands -123.97 -150.08 – -97.86 <0.001
Region: East of England -87.53 -112.07 – -63.00 <0.001
Region: North East -104.21 -137.05 – -71.36 <0.001
Region: North West -96.16 -119.92 – -72.40 <0.001
Region: Northern Ireland -140.64 -178.74 – -102.54 <0.001
Region: Scotland -117.44 -142.96 – -91.91 <0.001
Region: South East -73.19 -95.53 – -50.85 <0.001
Region: South West -122.15 -147.64 – -96.67 <0.001
Region: Wales -115.29 -146.36 – -84.23 <0.001
Region: West Midlands -107.48 -132.31 – -82.64 <0.001
Region: Yorkshire and the Humber -118.68 -144.08 – -93.28 <0.001
Outsourcing: Outsourced -63.40 -78.65 – -48.16 <0.001
Migration: Arrived within the last year -131.83 -179.98 – -83.68 <0.001
Migration: Arrived within the last 3 years -51.39 -94.82 – -7.96 0.020
Migration: Arrived within the last 5 years -43.63 -91.53 – 4.27 0.074
Migration: Arrived within the last 10 years -13.09 -50.59 – 24.41 0.494
Migration: Arrived within the last 15 years 15.99 -25.69 – 57.66 0.452
Migration: Arrived within the last 20 years 28.68 -16.08 – 73.44 0.209
Migration: Arrived within the last 30 years 75.23 23.25 – 127.21 0.005
Migration: Arrived more than 30 years ago -4.27 -45.48 – 36.94 0.839
Migration: Prefer not to say -44.95 -120.86 – 30.95 0.246
Observations 8291
R2 / R2 adjusted 0.184 / 0.180

2 Outsourced workers are paid less across most sectors and in most jobs

2.1 Sectors

Code
sector_summary_pay <- data %>%
  filter(income_drop_all == 0 & !is.na(income_weekly_all)) %>%
  group_by(SectorName, SectorName_labelled, outsourcing_status) %>%
  summarise(
    n = n(),
    Frequency = sum(NatRepemployees),
    avg_income = mean(income_weekly_all, na.rm=T),
    wtd_avg_income = weighted.mean(income_weekly_all, w = NatRepemployees, na.rm=T)
  ) %>% 
  ungroup() %>%
  group_by(SectorName) %>%
  mutate(
    N = sum(n),
    Sum = sum(Frequency),
    perc = 100 * (Frequency/Sum),
    SectorName_labelled = case_when(SectorName_labelled == "NA" ~ NA,
                                    TRUE ~ SectorName_labelled),
    SectorName_short = SectorName_labelled
  ) %>%
  # make the sector names more readable
  separate_wider_delim(SectorName_short, names = c("SectorName_short", "SectorName_short_detail"), delim=";",
                       too_few = "align_start") %>%
  mutate(
    SectorName_short = factor(stringr::str_to_sentence(SectorName_short)),
    SectorName_short_detail = factor(stringr::str_to_sentence(SectorName_short_detail)),
  )

summary_weekly <- sector_summary_pay %>%
  group_by(SectorName_labelled) %>%
  mutate(
    min_n = min(n, na.rm=TRUE) 
  ) %>%
  filter(min_n >= 10) %>% # need to identify the unit occs that have an ok n
  ungroup()

write_csv(sector_summary_pay, file="../outputs/data/sector_summary_pay_weekly.csv")

plot_data <- sector_summary_pay %>%
  drop_na(SectorName_short) %>%
  droplevels() %>%
  ungroup()

# Filter for 'outsourced' level and reorder SectorName_short
not_outsourced_levels <- plot_data %>%
  filter(outsourcing_status == 'Not outsourced') %>%
  mutate(SectorName_short = forcats::fct_reorder(SectorName_short, N, .desc = FALSE))

# outsourced <- plot_data %>%
#   filter(outsourcing_status == 'Outsourced') %>%
#   mutate(
#     rank = rank(desc(perc))
#   )

# Apply the reordered levels back to the original data
plot_data <- plot_data %>%
  mutate(
    SectorName_short = factor(SectorName_short, levels = levels(not_outsourced_levels$SectorName_short)),
        ) %>%
  arrange(desc(SectorName_short))

annotation_df <- plot_data %>%
  #filter(outsourcing_status == "Not outsourced") %>%
  dplyr::select(SectorName_short, n) %>%
  group_by(SectorName_short) %>%
  summarise(
    N = sum(n)
  ) %>%
    mutate(
    ypos = max(plot_data$wtd_avg_income, na.rm=T) * 1.2
  ) 

plot_data %>%
  # mutate(
  #   SectorName = as.factor(SectorName)
  # ) %>%
  ggplot(., aes(wtd_avg_income,SectorName_short, size = perc, colour = outsourcing_status)) +
    geom_point(position = "dodge") + 
  theme_minimal() +
  theme(legend.position = "bottom",
        legend.title = element_blank())+
      #coord_flip() +
  scale_x_continuous(breaks=seq(0,max(plot_data$wtd_avg_income, na.rm=T), 100)) +
  scale_colour_manual(values=colours) +
  geom_text(inherit.aes=F,data=annotation_df, aes(x=ypos, y=SectorName_short, label = paste0("N = ", N)), hjust=1) +
  geom_label_repel(inherit.aes = F, aes(wtd_avg_income, SectorName_short, colour = outsourcing_status, label=paste0("n=",n)), size=3) +
  guides(size=FALSE) + # remove size legend as gauging size is difficult 
  xlab("Weighted average weekly income") + ylab("Sector") +
  labs(caption = "Size of bubble represents the size of the respective workforce within the sector")

Code
sectors_of_interest <- unique(plot_data$SectorName_labelled)
sectors_of_interest <- sectors_of_interest[1:13] %>%
  stringr::str_to_title()

2.2 Minor sub groups across sectors

Code
unit_occ_summary_pay <- data %>%
  filter(income_drop_all == 0 & !is.na(income_weekly_all)) %>%
  group_by(UnitOccupation_labelled, outsourcing_status) %>%
  summarise(
    n = n(),
    Frequency = sum(NatRepemployees),
    avg_income = mean(income_weekly_all, na.rm=T),
    wtd_avg_income = weighted.mean(income_weekly_all, w = NatRepemployees, na.rm=T)
  ) %>% 
  ungroup() %>%
  group_by(UnitOccupation_labelled) %>%
  mutate(
    N = sum(n),
    Sum = sum(Frequency),
    perc = 100 * (Frequency/Sum),
    UnitOccupation_labelled = case_when(UnitOccupation_labelled == "NA" ~ NA,
                                    TRUE ~ UnitOccupation_labelled),
    UnitOccupation_labelled = stringr::str_to_title(UnitOccupation_labelled)
  )  %>%
  ungroup()

# write_csv(unit_occ_summary_pay, file="../outputs/data/minor_group_occupation_summary_pay_weekly_across_sectors.csv")

# need to identify the unit occs that have an ok n
# subste to occs with n>=10
unit_subset_weekly <- unit_occ_summary_pay %>%
  group_by(UnitOccupation_labelled) %>%
  mutate(
    min_n = min(n, na.rm=TRUE)
  ) %>%
  filter(min_n >= 10)

unit_subset <- unit_subset_weekly


# create a df with occs where outsourced paid less so we can just list it
paid_less <- unit_subset %>%
  pivot_wider(id_cols = c(UnitOccupation_labelled), names_from = outsourcing_status, values_from = c(wtd_avg_income, n)) %>%
  janitor::clean_names() %>%
  mutate(
    pay_penalty = wtd_avg_income_outsourced - wtd_avg_income_not_outsourced
  ) %>%
  filter(
    pay_penalty < 0
  )

# write_csv(paid_less, file="../outputs/data/minor_group_occupation_weekly_pay_penalty_across_sectors.csv")

#print(sector)
# subset to this sector and drop na occupatoins
plot_data <- unit_subset %>%
  filter(!is.na(UnitOccupation_labelled)) %>%
  droplevels() %>%
  ungroup()

# Order occs by N
# First filter for 'outsourced' level and reorder by N
not_outsourced_levels <- plot_data %>%
  dplyr::select(UnitOccupation_labelled, outsourcing_status, N) %>%
  distinct(UnitOccupation_labelled, N) %>%
  mutate(UnitOccupation_labelled = forcats::fct_reorder(UnitOccupation_labelled, N, .desc = FALSE))

# not_outsourced_levels <- plot_data %>%
#   filter(outsourcing_status == 'Not outsourced') %>%
#   mutate(UnitOccupation_labelled = forcats::fct_reorder(UnitOccupation_labelled, N, .desc = FALSE))
# Then apply the reordered levels back to the original data
plot_data <- plot_data %>%
  mutate(
    UnitOccupation_labelled = factor(UnitOccupation_labelled, levels = levels(not_outsourced_levels$UnitOccupation_labelled)),
        )

annotation_df <- plot_data %>%
  #filter(outsourcing_status == "Not outsourced") %>%
  dplyr::select(UnitOccupation_labelled, n) %>%
  group_by(UnitOccupation_labelled) %>%
  summarise(
    N = sum(n)
  ) %>%
    mutate(
    ypos = max(plot_data$wtd_avg_income, na.rm=T) * 1.2
  )


p <- plot_data %>%
  ggplot(., aes(wtd_avg_income, UnitOccupation_labelled, size = perc, colour = outsourcing_status)) +
  geom_point(position = "dodge") + 
geom_label_repel(inherit.aes = F, aes(wtd_avg_income, UnitOccupation_labelled, colour = outsourcing_status, label=paste0("n=",n)), size=3, 
                #force_pull = 2
                ) + 
  theme_minimal() +
  theme(legend.position = "bottom",
      legend.title = element_blank()) +

  #coord_flip() +
scale_x_continuous(breaks=scales::breaks_pretty(n=5)) +
  #breaks=seq(0,max(plot_data$wtd_avg_income, na.rm=T), 200)) +
scale_colour_manual(values=colours) +
geom_text(inherit.aes=F,data=annotation_df, aes(x=ypos, y=UnitOccupation_labelled, label = paste0("N = ", N)), hjust=1)  +
guides(size=FALSE) + # remove size legend as gauging size is difficult 
xlab("Weighted average weekly income") + ylab("Unit occupation") +
labs(caption = "Size of bubble represents the size of the respective workforce within the occupation") +
  ggtitle("All sectors")

show(p)

Code
# ggsave(here('outputs','figures','occupation_pay_plots','unit_occupation_pay_plot_all_sectors.png'), height = 8, width = 8, dpi=800, bg="white")

2.3 Minor subgroups within sectors

Code
unit_occ_in_sect_summary_pay <- data %>%
  filter(income_drop_all == 0 & !is.na(income_weekly_all)) %>%
  group_by(SectorName, SectorName_labelled, UnitOccupation_labelled, outsourcing_status) %>%
  summarise(
    n = n(),
    Frequency = sum(NatRepemployees),
    avg_income = mean(income_weekly_all, na.rm=T),
    wtd_avg_income = weighted.mean(income_weekly_all, w = NatRepemployees, na.rm=T)
  ) %>% 
  ungroup() %>%
  group_by(SectorName_labelled, UnitOccupation_labelled) %>%
  mutate(
    N = sum(n),
    Sum = sum(Frequency),
    perc = 100 * (Frequency/Sum),
    UnitOccupation_labelled = case_when(UnitOccupation_labelled == "NA" ~ NA,
                                    TRUE ~ UnitOccupation_labelled),
    UnitOccupation_labelled = stringr::str_to_title(UnitOccupation_labelled),
    SectorName_labelled = stringr::str_to_title(SectorName_labelled)
    
  )  %>%
  ungroup()

summary_weekly <- unit_occ_in_sect_summary_pay %>%
  group_by(SectorName_labelled,UnitOccupation_labelled) %>%
  mutate(
    min_n = min(n, na.rm=TRUE) 
  ) %>%
  filter(min_n >= 10) %>% # need to identify the unit occs that have an ok n
  ungroup() %>%
  mutate(
    pay_frame = "weekly"
  )

# write_csv(unit_occ_in_sect_summary_pay, file="../outputs/data/minor_group_occupation_in_sector_summary_pay_weekly.csv")


unit_subset <- summary_weekly


# create a df with occs where outsourced paid less so we can just list it
paid_less <- unit_subset %>%
  pivot_wider(id_cols = c(SectorName_labelled, UnitOccupation_labelled), names_from = outsourcing_status, values_from = c(wtd_avg_income, n)) %>%
  janitor::clean_names() %>%
  mutate(
    pay_penalty = wtd_avg_income_outsourced - wtd_avg_income_not_outsourced
  ) %>%
  filter(
    pay_penalty < 0
  )

# write_csv(paid_less, file="../outputs/data/minor_group_occupation_in_sector_weekly_pay_penalty.csv")


for(sector in sectors_of_interest){
    #print(sector)
    # subset to this sector and drop na occupatoins
    plot_data <- unit_subset %>%
      filter(SectorName_labelled == sector) %>%
      filter(!is.na(UnitOccupation_labelled)) %>%
      droplevels() %>%
      ungroup()
    
    # Order occs by N
    # First filter for 'outsourced' level and reorder by N
    not_outsourced_levels <- plot_data %>%
      dplyr::select(UnitOccupation_labelled, outsourcing_status, N) %>%
      distinct(UnitOccupation_labelled, N) %>%
      mutate(UnitOccupation_labelled = forcats::fct_reorder(UnitOccupation_labelled, N, .desc = FALSE))
    
    # not_outsourced_levels <- plot_data %>%
    #   filter(outsourcing_status == 'Not outsourced') %>%
    #   mutate(UnitOccupation_labelled = forcats::fct_reorder(UnitOccupation_labelled, N, .desc = FALSE))
    # Then apply the reordered levels back to the original data
    plot_data <- plot_data %>%
      mutate(
        UnitOccupation_labelled = factor(UnitOccupation_labelled, levels = levels(not_outsourced_levels$UnitOccupation_labelled)),
            )
    
    annotation_df <- plot_data %>%
      #filter(outsourcing_status == "Not outsourced") %>%
      dplyr::select(UnitOccupation_labelled, n) %>%
      group_by(UnitOccupation_labelled) %>%
      summarise(
        N = sum(n)
      ) %>%
        mutate(
        ypos = max(plot_data$wtd_avg_income, na.rm=T) * 1.2
      )
    
    
    p <- plot_data %>%
      ggplot(., aes(wtd_avg_income, UnitOccupation_labelled, size = perc, colour = outsourcing_status)) +
      geom_point(position = "dodge") + 
    geom_label_repel(inherit.aes = F, aes(wtd_avg_income, UnitOccupation_labelled, colour = outsourcing_status, label=paste0("n=",n)), size=3, 
                    #force_pull = 2
                    ) + 
      theme_minimal() +
      theme(legend.position = "bottom",
          legend.title = element_blank()) +

      #coord_flip() +
    scale_x_continuous(breaks=scales::breaks_pretty(n=5)) +
      #breaks=seq(0,max(plot_data$wtd_avg_income, na.rm=T), 200)) +
    scale_colour_manual(values=colours) +
    geom_text(inherit.aes=F,data=annotation_df, aes(x=ypos, y=UnitOccupation_labelled, label = paste0("N = ", N)), hjust=1)  +
    guides(size=FALSE) + # remove size legend as gauging size is difficult 
    xlab("Weighted average weekly income") + ylab("Unit occupation") +
    labs(caption = "Size of bubble represents the size of the respective workforce within the occupation") +
      ggtitle(sector)
    
    show(p)
    
    # ggsave(here('outputs','figures','occupation_pay_plots',paste0('unit_occupation_pay_plot_weekly_', sector, '.png')), height = 8, width = 8, dpi=800, bg="white")
}

2.4 Major subgroups across sectors

Code
major_occ_summary_pay <- data %>%
  filter(income_drop_all == 0 & !is.na(income_weekly_all)) %>%
  group_by(MajorsubgroupOccupation_labelled, outsourcing_status) %>%
  summarise(
    n = n(),
    Frequency = sum(NatRepemployees),
    avg_income = mean(income_weekly_all, na.rm=T),
    wtd_avg_income = weighted.mean(income_weekly_all, w = NatRepemployees, na.rm=T)
  ) %>% 
  ungroup() %>%
  group_by(MajorsubgroupOccupation_labelled) %>%
  mutate(
    N = sum(n),
    Sum = sum(Frequency),
    perc = 100 * (Frequency/Sum),
    MajorsubgroupOccupation_labelled = case_when(MajorsubgroupOccupation_labelled == "NA" ~ NA,
                                    TRUE ~ MajorsubgroupOccupation_labelled),
    MajorsubgroupOccupation_labelled = stringr::str_to_title(MajorsubgroupOccupation_labelled)
  )  %>%
  ungroup()

# write_csv(major_occ_summary_pay, file="../outputs/data/major_subgroup_across_sectors_occupation_summary_pay_weekly.csv")

# need to identify the unit occs that have an ok n
# subste to occs with n>=10
unit_subset_weekly <- major_occ_summary_pay %>%
  group_by(MajorsubgroupOccupation_labelled) %>%
  mutate(
    min_n = min(n, na.rm=TRUE)
  ) %>%
  filter(min_n >= 10)

unit_subset <- unit_subset_weekly


# create a df with occs where outsourced paid less so we can just list it
paid_less <- unit_subset %>%
  pivot_wider(id_cols = c(MajorsubgroupOccupation_labelled), names_from = outsourcing_status, values_from = c(wtd_avg_income, n)) %>%
  janitor::clean_names() %>%
  mutate(
    pay_penalty = wtd_avg_income_outsourced - wtd_avg_income_not_outsourced
  ) %>%
  filter(
    pay_penalty < 0
  )

# write_csv(paid_less, file="../outputs/data/major_subgroup_occupation_weekly_pay_penalty_across_sectors.csv")

#print(sector)
# subset to this sector and drop na occupatoins
plot_data <- unit_subset %>%
  filter(!is.na(MajorsubgroupOccupation_labelled)) %>%
  droplevels() %>%
  ungroup()

# Order occs by N
# First filter for 'outsourced' level and reorder by N
not_outsourced_levels <- plot_data %>%
  dplyr::select(MajorsubgroupOccupation_labelled, outsourcing_status, N) %>%
  distinct(MajorsubgroupOccupation_labelled, N) %>%
  mutate(MajorsubgroupOccupation_labelled = forcats::fct_reorder(MajorsubgroupOccupation_labelled, N, .desc = FALSE))

# not_outsourced_levels <- plot_data %>%
#   filter(outsourcing_status == 'Not outsourced') %>%
#   mutate(MajorsubgroupOccupation_labelled = forcats::fct_reorder(MajorsubgroupOccupation_labelled, N, .desc = FALSE))
# Then apply the reordered levels back to the original data
plot_data <- plot_data %>%
  mutate(
    MajorsubgroupOccupation_labelled = factor(MajorsubgroupOccupation_labelled, levels = levels(not_outsourced_levels$MajorsubgroupOccupation_labelled)),
        )

annotation_df <- plot_data %>%
  #filter(outsourcing_status == "Not outsourced") %>%
  dplyr::select(MajorsubgroupOccupation_labelled, n) %>%
  group_by(MajorsubgroupOccupation_labelled) %>%
  summarise(
    N = sum(n)
  ) %>%
    mutate(
    ypos = max(plot_data$wtd_avg_income, na.rm=T) * 1.2
  )


p <- plot_data %>%
  ggplot(., aes(wtd_avg_income, MajorsubgroupOccupation_labelled, size = perc, colour = outsourcing_status)) +
  geom_point(position = "dodge") + 
geom_label_repel(inherit.aes = F, aes(wtd_avg_income, MajorsubgroupOccupation_labelled, colour = outsourcing_status, label=paste0("n=",n)), size=3, 
                #force_pull = 2
                ) + 
  theme_minimal() +
  theme(legend.position = "bottom",
      legend.title = element_blank()) +

  #coord_flip() +
scale_x_continuous(breaks=scales::breaks_pretty(n=5)) +
  #breaks=seq(0,max(plot_data$wtd_avg_income, na.rm=T), 200)) +
scale_colour_manual(values=colours) +
geom_text(inherit.aes=F,data=annotation_df, aes(x=ypos, y=MajorsubgroupOccupation_labelled, label = paste0("N = ", N)), hjust=1)  +
guides(size=FALSE) + # remove size legend as gauging size is difficult 
xlab("Weighted average weekly income") + ylab("Major sub group occupation") +
labs(caption = "Size of bubble represents the size of the respective workforce within the occupation") +
  ggtitle("All sectors")

show(p)

Code
# ggsave(here('outputs','figures','occupation_pay_plots','major_subgroup_occupation_all_sectors_pay_plot.png'), height = 8, width = 8, dpi=800, bg="white")

2.5 Major subgroups within sectors

Code
occ_in_sect_summary_pay <- data %>%
  filter(income_drop_all == 0 & !is.na(income_weekly_all)) %>%
  group_by(SectorName, SectorName_labelled, MajorsubgroupOccupation_labelled, outsourcing_status) %>%
  summarise(
    n = n(),
    Frequency = sum(NatRepemployees),
    avg_income = mean(income_weekly_all, na.rm=T),
    wtd_avg_income = weighted.mean(income_weekly_all, w = NatRepemployees, na.rm=T)
  ) %>% 
  ungroup() %>%
  group_by(SectorName_labelled, MajorsubgroupOccupation_labelled) %>%
  mutate(
    N = sum(n),
    Sum = sum(Frequency),
    perc = 100 * (Frequency/Sum),
    MajorsubgroupOccupation_labelled = case_when(MajorsubgroupOccupation_labelled == "NA" ~ NA,
                                    TRUE ~ MajorsubgroupOccupation_labelled),
    MajorsubgroupOccupation_labelled = stringr::str_to_title(MajorsubgroupOccupation_labelled),
    SectorName_labelled = stringr::str_to_title(SectorName_labelled)
    
  ) 

summary_weekly <- occ_in_sect_summary_pay %>%
  group_by(SectorName_labelled,MajorsubgroupOccupation_labelled) %>%
  mutate(
    min_n = min(n, na.rm=TRUE) 
  ) %>%
  filter(min_n >= 10) %>% # need to identify the unit occs that have an ok n
  ungroup() %>%
  mutate(
    pay_frame = "weekly"
  )

# write_csv(occ_in_sect_summary_pay, file="../outputs/data/major_subgroup_occupation_in_sector_summary_pay_weekly.csv")


for(sector in sectors_of_interest){
    #print(sector)
    # subset to this sector and drop na occupatoins
    plot_data <- occ_in_sect_summary_pay %>%
      filter(SectorName_labelled == sector) %>%
      filter(!is.na(MajorsubgroupOccupation_labelled)) %>%
      droplevels() %>%
      ungroup()
    
    # Order occs by N
    # First filter for 'outsourced' level and reorder by N
    not_outsourced_levels <- plot_data %>%
      dplyr::select(MajorsubgroupOccupation_labelled, outsourcing_status, N) %>%
      distinct(MajorsubgroupOccupation_labelled, N) %>%
      mutate(MajorsubgroupOccupation_labelled = forcats::fct_reorder(MajorsubgroupOccupation_labelled, N, .desc = FALSE))
    
    # not_outsourced_levels <- plot_data %>%
    #   filter(outsourcing_status == 'Not outsourced') %>%
    #   mutate(MajorsubgroupOccupation_labelled = forcats::fct_reorder(MajorsubgroupOccupation_labelled, N, .desc = FALSE))
    # Then apply the reordered levels back to the original data
    plot_data <- plot_data %>%
      mutate(
        MajorsubgroupOccupation_labelled = factor(MajorsubgroupOccupation_labelled, levels = levels(not_outsourced_levels$MajorsubgroupOccupation_labelled)),
            )
    
    annotation_df <- plot_data %>%
      #filter(outsourcing_status == "Not outsourced") %>%
      dplyr::select(MajorsubgroupOccupation_labelled, n) %>%
      group_by(MajorsubgroupOccupation_labelled) %>%
      summarise(
        N = sum(n)
      ) %>%
        mutate(
        ypos = max(plot_data$wtd_avg_income, na.rm=T) * 1.2
      )
    
    p <- plot_data %>%
      ggplot(., aes(wtd_avg_income, MajorsubgroupOccupation_labelled, size = perc, colour = outsourcing_status)) +
      geom_point(position = "dodge") + 
    geom_label_repel(inherit.aes = F, aes(wtd_avg_income, MajorsubgroupOccupation_labelled, colour = outsourcing_status, label=paste0("n=",n)), size=3, 
                    #force_pull = 2
                    ) + 
      theme_minimal() +
      theme(legend.position = "bottom",
          legend.title = element_blank()) +

      #coord_flip() +
    scale_x_continuous(breaks=seq(0,max(plot_data$wtd_avg_income, na.rm=T), 200)) +
    scale_colour_manual(values=colours) +
    geom_text(inherit.aes=F,data=annotation_df, aes(x=ypos, y=MajorsubgroupOccupation_labelled, label = paste0("N = ", N)), hjust=1)  +
    guides(size=FALSE) + # remove size legend as gauging size is difficult 
    xlab("Weighted average weekly income") + ylab("Major subgroup occupation") +
    labs(caption = "Size of bubble represents the size of the respective workforce within the occupation") +
      ggtitle(sector)
    
    show(p)
    
    # ggsave(here('outputs','figures','occupation_pay_plots',paste0('major_subgroup_occupation_pay_plot_weekly_', sector, '.png')), height = 8, width = 8, dpi=800, bg="white")
}

3 This group of workers is varied, spread across sectors and regions (chapter 2)

Code
sector_summary_3 <- data %>%
  #filter(income_drop_all == 0) %>%
  group_by(SectorName, SectorName_labelled, outsourcing_status) %>%
  summarise(
    n = n(),
    Frequency = sum(NatRepemployees),
    # avg_income = mean(income_annual_all, na.rm=T),
    # wtd_avg_income = weighted.mean(income_annual_all, w = NatRepemployees, na.rm=T)
  ) %>% 
  ungroup() %>%
  group_by(SectorName) %>%
  mutate(
    N = sum(n),
    Sum = sum(Frequency),
    perc = 100 * (Frequency/Sum),
    SectorName_labelled = case_when(SectorName_labelled == "NA" ~ NA,
                                    TRUE ~ SectorName_labelled),
    SectorName_short = SectorName_labelled
  ) %>%
  # make the sector names more readable
  separate_wider_delim(SectorName_short, names = c("SectorName_short", "SectorName_short_detail"), delim=";",
                       too_few = "align_start") %>%
  mutate(
    SectorName_short = factor(stringr::str_to_sentence(SectorName_short)),
    SectorName_short_detail = factor(stringr::str_to_sentence(SectorName_short_detail)),
  )

# write_csv(sector_summary_3, file="../outputs/data/sector_summary_3.csv")

The plot below shows the proportion of outsourced and not outsourced workers within each sector. I.e. this is showing what sectors have higher and lower proportions of outsourced workers.

Code
plot_data <- sector_summary_3 %>%
  drop_na(SectorName_short) %>%
    droplevels() %>%
  ungroup()

# Filter for 'outsourced' level and reorder SectorName_short
not_outsourced_levels <- plot_data %>%
  filter(outsourcing_status == 'Not outsourced') %>%
  mutate(SectorName_short = forcats::fct_reorder(SectorName_short, perc, .desc = TRUE))

outsourced <- plot_data %>%
  filter(outsourcing_status == 'Outsourced') %>%
  mutate(
    rank = rank(desc(perc))
  )

# Apply the reordered levels back to the original data
plot_data <- plot_data %>%
  mutate(
    SectorName_short = factor(SectorName_short, levels = levels(not_outsourced_levels$SectorName_short)),
         )

# annotation_df <- plot_data %>%
#   dplyr::select(SectorName_short, outsourcing_status, perc, n
# mutate(
  
annotation_df <- plot_data %>%
  filter(outsourcing_status == "Not outsourced") %>%
  dplyr::select(SectorName_short, N) %>%
  mutate(
    ypos = 80
  )

ggplot(plot_data, aes(SectorName_short, perc, fill = outsourcing_status)) +
  geom_col() +
  geom_text(inherit.aes=F,data=annotation_df, aes(x=SectorName_short, y=ypos, label = paste0("N = ", N)), hjust=1, nudge_y = 15) +
  coord_flip() +
  scale_fill_manual(values=many_colours) +
  scale_y_continuous(breaks=seq(0,100,10))

Code
# sector_key <- data.frame("number" = seq(1,length(unique(plot_data$SectorName_labelled)),1),
#                           "Sector" = levels(plot_data$SectorName_labelled))
# 
# sector_key %>%
#   kable() %>%
#   kable_styling(full_width = F)

The top three Sectors with the highest proportion of outsourced workers are:

  • ACTIVITIES OF HOUSEHOLDS AS EMPLOYERS; UNDIFFERENTIATED GOODS-AND SERVICES-PRODUCING ACTIVITIES OF HOUSEHOLDS FOR OWN US (note that N = 31)
  • ADMINISTRATIVE AND SUPPORT SERVICE ACTIVITIES
  • WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND REMEDIATION ACTIVITIES

Note that for an undefined sector (‘Not found’) contained one of the largest proportions of outsourced workers (31% of workers in the ‘Not found’ category were outsourced).

A key takeaway here is that whereas the total outsourced population is 17%, this figure varies by sector, from 0% for Mining… and Extraterritoral organisations… all the way to 36% for Activities of households as employers, with 5 out 20 sectors having at least 20% of their workforce outsourced.

The plot below shows the proportion of workers within each region who are outsourced.

Code
region_statistics_2 <- data %>%
  # get values of labels
  # mutate_all(haven::as_factor) %>%
  group_by(Region, outsourcing_status) %>%
  summarise(
    Frequency = sum(NatRepemployees),
    n = n(),
  ) %>%
  mutate(
    N = sum(n),
    Sum = sum(Frequency),
    Percentage = 100 * (Frequency / Sum)
  ) %>%
  rename(
    `Outsourcing status` = outsourcing_status
  ) %>%
  ungroup()

reg_levels <- region_statistics_2 %>%
  filter(`Outsourcing status` == "Outsourced") %>%
  mutate(
    Region = forcats::fct_reorder(Region, Percentage, .desc=FALSE)
  )

annotation_df <- region_statistics_2 %>%
  filter(`Outsourcing status` == "Not outsourced") %>%
  dplyr::select(Region, N) %>%
  mutate(
    ypos = 100
  )

region_statistics_2 %>%
  mutate(
    Region = factor(Region, levels = levels(reg_levels$Region))
  ) %>%
  ggplot(., aes(Region, Percentage, fill = `Outsourcing status`)) +
  geom_col(colour="black") +
  geom_text(inherit.aes=F, data = annotation_df, aes(Region, ypos, label = paste0("N=",N)), hjust=1, nudge_y = -2) +
  coord_flip() +
  scale_fill_manual(values=many_colours) +
  theme_minimal()

Code
# readr::write_csv(region_statistics_2, file = "../outputs/data/region_stats_2.csv")

region_statistics_2_1 <- region_statistics_2 %>% 
  filter(`Outsourcing status` == "Outsourced" & Region != "London")

london_perc <- region_statistics_2[which(region_statistics_2$Region == "London" & region_statistics_2["Outsourcing status"] == "Outsourced"), "Percentage"]
Code
region_statistics_3 <- data %>%
  filter(outsourcing_status == "Outsourced") %>%
  # get values of labels
  # mutate_all(haven::as_factor) %>%
  group_by(Region) %>%
  summarise(
    Frequency = sum(NatRepemployees)
  ) %>%
  mutate(
    Sum = sum(Frequency),
    Percentage = 100 * (Frequency / Sum)
  )

# readr::write_csv(region_statistics_3, file = "../outputs/data/region_stats_3.csv")

We can also explore how the the entire outsourced workforce in the UK is distributed across the country. The table below shows the percentage of outsourced workers in each region as a proportion of the total UK workforce. They show where the UK’s outsourced workforce is concentrated.

Code
region_statistics_3 %>%
  mutate(
    Region = haven::as_factor(Region)
    ) %>%
  arrange(desc(Percentage)) %>%
  knitr::kable(.,digits = 2) %>%
  kable_styling(full_width = F)
Table 3: Distribution of outsourced workers across regions
Region Frequency Sum Percentage
London 357.35 1708.36 20.92
North West 189.39 1708.36 11.09
South East 188.47 1708.36 11.03
West Midlands 161.49 1708.36 9.45
East Midlands 140.50 1708.36 8.22
Scotland 125.82 1708.36 7.37
East of England 125.49 1708.36 7.35
South West 120.50 1708.36 7.05
Yorkshire and the Humber 119.46 1708.36 6.99
Wales 83.25 1708.36 4.87
North East 53.06 1708.36 3.11
Northern Ireland 43.56 1708.36 2.55

4 Our data indicates a racialised picture of outsourcing in the UK

All models were run with a quasibinomial distribution to account for the survey weights. Because quasibinomial models don’t involve likelihoods, model fit could not be determined using information criteria. Instead, we use Tjur’s pseudo R2 to appraise model fit. This approach uses predicted probabilities from the model to calculate the average predicted probability for each level of the outcome variable, then takes the difference in this average between the two levels. The result is a figure between 0 and 1 that estimates how different the average predicted probability in the outsourced group is from the average predicted probability in the not outsourced group. Values closer to 1 indicate the averages are more different and therefore that the model is more accurate in discriminating between the two groups.

Code
mod2 <- glm(outsourcing_status ~ Ethnicity_binary + Age + Gender + Has_Degree +  Region + BORNUK_labelled, data, weights = NatRepemployees, family="quasibinomial")

tjur2 <- data %>%
  mutate(pred = predict(mod2, type = "response")) %>%
  group_by(outsourcing_status) %>%
  summarise(weighted_mean = weighted.mean(pred, NatRepemployees)) %>%
  summarise(tjur_r2 = diff(weighted_mean)) %>%
  pull()

coefs <- extract_glm_coefs(mod2, only_sig=T)
ethn_coef <- coefs %>% filter(stringr::str_detect(variable, "Ethnicity"))

For model 1, the value of Tjur’s pseudo R2 is 0.036. The table below shows the model coefficients.

Code
labels <- c(
  'Intercept',
  'Ethnicity: Not White',
  'Age',
  'Gender: Male',
  'Gender: Other',
  'Gender: Prefer not to say',
  'Education: Has degree',
  "Education: Don't know",
  'Region: East Midlands',
  'Region: East of England',
  'Region: North East',
  'Region: North West',
  'Region: Northern Ireland',
  'Region: Scotland',
  'Region: South East',
  'Region: South West',
  'Region: Wales',
  'Region: West Midlands',
  'Region: Yorkshire and the Humber',
  'Migration: Arrived within the last year',
  'Migration: Arrived within the last 3 years',
  'Migration: Arrived within the last 5 years',
  'Migration: Arrived within the last 10 years',
  'Migration: Arrived within the last 15 years',
  'Migration: Arrived within the last 20 years',
  'Migration: Arrived within the last 30 years',
  'Migration: Arrived more than 30 years ago',
  'Migration: Prefer not to say'
)


tab_model(mod2, 
          pred.labels = labels, 
          dv.labels = "Outsourcing",
          show.r2 = FALSE)
  Outsourcing
Predictors Odds Ratios CI p
Intercept 0.51 0.40 – 0.64 <0.001
Ethnicity: Not White 1.18 1.01 – 1.37 0.031
Age 0.98 0.97 – 0.98 <0.001
Gender: Male 1.42 1.27 – 1.58 <0.001
Gender: Other 1.11 0.22 – 3.70 0.876
Gender: Prefer not to say 0.80 0.34 – 1.64 0.575
Education: Has degree 0.93 0.83 – 1.04 0.187
Education: Don't know 0.94 0.62 – 1.38 0.743
Region: East Midlands 0.89 0.71 – 1.12 0.314
Region: East of England 0.58 0.46 – 0.73 <0.001
Region: North East 0.62 0.44 – 0.84 0.003
Region: North West 0.83 0.67 – 1.02 0.081
Region: Northern Ireland 0.74 0.51 – 1.04 0.091
Region: Scotland 0.68 0.54 – 0.86 0.001
Region: South East 0.60 0.49 – 0.73 <0.001
Region: South West 0.65 0.51 – 0.82 <0.001
Region: Wales 0.86 0.65 – 1.12 0.269
Region: West Midlands 0.83 0.67 – 1.03 0.087
Region: Yorkshire and the Humber 0.66 0.52 – 0.83 <0.001
Migration: Arrived within the last year 1.97 1.36 – 2.83 <0.001
Migration: Arrived within the last 3 years 1.12 0.75 – 1.63 0.565
Migration: Arrived within the last 5 years 1.16 0.77 – 1.70 0.462
Migration: Arrived within the last 10 years 1.69 1.26 – 2.24 <0.001
Migration: Arrived within the last 15 years 1.58 1.13 – 2.17 0.006
Migration: Arrived within the last 20 years 1.69 1.18 – 2.38 0.004
Migration: Arrived within the last 30 years 0.50 0.26 – 0.87 0.023
Migration: Arrived more than 30 years ago 2.05 1.45 – 2.87 <0.001
Migration: Prefer not to say 1.18 0.74 – 1.83 0.471
Observations 10155
Code
mod <- glm(outsourcing_status ~ Ethnicity_collapsed + Age + Gender + Has_Degree + Region + BORNUK_labelled, data, weights = NatRepemployees, family="quasibinomial")

tjur <- data %>%
  mutate(pred = predict(mod, type = "response")) %>%
  group_by(outsourcing_status) %>%
  summarise(weighted_mean = weighted.mean(pred, NatRepemployees)) %>%
  summarise(tjur_r2 = diff(weighted_mean)) %>%
  pull()

coefs <- extract_glm_coefs(mod, only_sig=T)

ethn_coef <- coefs %>% filter(stringr::str_detect(variable, "Ethnicity"))

For model 2, the value of Tjur’s pseudo R2 for this model is 0.039. The table below shows the model coefficients.

Code
labels <- c(
  'Intercept',
  'Ethnicity: Arab/British Arab',
  'Ethnicity: Asian/Asian British',
  'Ethnicity: Black/African/Caribbean/Black British',
  'Ethnicity: Mixed/Multiple ethnic group',
  'Ethnicity: Other ethnic group',
  'Ethnicity: Prefer not to say',
  'Ethnicity: White other',
  'Age',
  'Gender: Male',
  'Gender: Other',
  'Gender: Prefer not to say',
  'Education: Has degree',
  "Education: Don't know",
  'Region: East Midlands',
  'Region: East of England',
  'Region: North East',
  'Region: North West',
  'Region: Northern Ireland',
  'Region: Scotland',
  'Region: South East',
  'Region: South West',
  'Region: Wales',
  'Region: West Midlands',
  'Region: Yorkshire and the Humber',
  'Migration: Arrived within the last year',
  'Migration: Arrived within the last 3 years',
  'Migration: Arrived within the last 5 years',
  'Migration: Arrived within the last 10 years',
  'Migration: Arrived within the last 15 years',
  'Migration: Arrived within the last 20 years',
  'Migration: Arrived within the last 30 years',
  'Migration: Arrived more than 30 years ago',
  'Migration: Prefer not to say'
)

tab_model(mod, 
          pred.labels = labels, 
          dv.labels = "Outsourcing",
          show.r2 = FALSE)
  Outsourcing
Predictors Odds Ratios CI p
Intercept 0.50 0.39 – 0.63 <0.001
Ethnicity: Arab/British Arab 1.96 0.91 – 4.07 0.075
Ethnicity: Asian/Asian British 1.22 1.00 – 1.48 0.053
Ethnicity: Black/African/Caribbean/Black British 1.43 1.08 – 1.89 0.012
Ethnicity: Mixed/Multiple ethnic group 1.37 0.95 – 1.93 0.082
Ethnicity: Other ethnic group 1.01 0.40 – 2.28 0.981
Ethnicity: Prefer not to say 1.26 0.95 – 1.66 0.101
Ethnicity: White other 0.76 0.58 – 0.99 0.041
Age 0.98 0.97 – 0.98 <0.001
Gender: Male 1.41 1.27 – 1.57 <0.001
Gender: Other 1.18 0.24 – 3.91 0.808
Gender: Prefer not to say 0.76 0.32 – 1.59 0.502
Education: Has degree 0.91 0.82 – 1.02 0.111
Education: Don't know 0.95 0.63 – 1.40 0.793
Region: East Midlands 0.90 0.71 – 1.13 0.368
Region: East of England 0.58 0.46 – 0.73 <0.001
Region: North East 0.61 0.44 – 0.84 0.003
Region: North West 0.83 0.67 – 1.02 0.081
Region: Northern Ireland 0.82 0.57 – 1.16 0.267
Region: Scotland 0.69 0.54 – 0.87 0.002
Region: South East 0.60 0.49 – 0.74 <0.001
Region: South West 0.66 0.52 – 0.84 0.001
Region: Wales 0.86 0.65 – 1.13 0.297
Region: West Midlands 0.83 0.66 – 1.03 0.087
Region: Yorkshire and the Humber 0.65 0.51 – 0.83 <0.001
Migration: Arrived within the last year 1.85 1.26 – 2.68 0.001
Migration: Arrived within the last 3 years 1.11 0.75 – 1.63 0.583
Migration: Arrived within the last 5 years 1.28 0.85 – 1.88 0.227
Migration: Arrived within the last 10 years 2.05 1.50 – 2.76 <0.001
Migration: Arrived within the last 15 years 1.82 1.30 – 2.54 <0.001
Migration: Arrived within the last 20 years 1.89 1.31 – 2.70 <0.001
Migration: Arrived within the last 30 years 0.54 0.28 – 0.95 0.045
Migration: Arrived more than 30 years ago 2.10 1.48 – 2.94 <0.001
Migration: Prefer not to say 1.17 0.73 – 1.82 0.499
Observations 10155
Code
mod <- glm(outsourcing_status ~ Ethnicity_collapsed_disaggregated + Age + Gender + Has_Degree + Region + BORNUK_labelled, data, weights = NatRepemployees, family="quasibinomial")
summary(mod)

# Test waht amount of variacne the unweighted would explain - similar
# test <- glm(outsourcing_status ~ Ethnicity_collapsed_disaggregated + Age + Gender + Has_Degree + Region + BORNUK_labelled, data, family="binomial")
# with(summary(test), 1 - deviance/null.deviance)


tjur <- data %>%
  filter(!is.na(Ethnicity_collapsed_disaggregated)) %>%
  mutate(pred = predict(mod, type = "response")) %>%
  group_by(outsourcing_status) %>%
  summarise(weighted_mean = weighted.mean(pred, NatRepemployees)) %>%
  summarise(tjur_r2 = diff(weighted_mean)) %>%
  pull()

coefs <- extract_glm_coefs(mod, only_sig=T)

ethn_coef <- coefs %>% filter(stringr::str_detect(variable, "Ethnicity"))

For model 3, the value of Tjur’s pseudo R2 for this model is 0.043. The table below shows the model coefficients.

Code
labels <- c(
  'Intercept',
  "Ethnicity: Irish",
  "Ethnicity: Gypsy or Irish Traveller",
  "Ethnicity: Roma",
  "Ethnicity: Any other White background",
  "Ethnicity: White and Black Caribbean",
  "Ethnicity: White and Black African",
  "Ethnicity: White and Asian",
  "Ethnicity: Any other Mixed/Multiple ethnic background",
  "Ethnicity: Indian",
  "Ethnicity: Pakistani",
  "Ethnicity: Bangladeshi",
  "Ethnicity: Chinese",
  "Ethnicity: Any other Asian background",
  "Ethnicity: African",
  "Ethnicity: Caribbean",
  "Ethnicity: Any other Black, Black British, or Caribbean background",
  "Ethnicity: Arab",
  "Ethnicity: Any other ethnic group",
  "Ethnicity: Don't think of myself as any of these",
  "Ethnicity: Prefer not to say",
  'Age',
  'Gender: Male',
  'Gender: Other',
  'Gender: Prefer not to say',
  'Education: Has degree',
  "Education: Don't know",
  'Region: East Midlands',
  'Region: East of England',
  'Region: North East',
  'Region: North West',
  'Region: Northern Ireland',
  'Region: Scotland',
  'Region: South East',
  'Region: South West',
  'Region: Wales',
  'Region: West Midlands',
  'Region: Yorkshire and the Humber',
  'Migration: Arrived within the last year',
  'Migration: Arrived within the last 3 years',
  'Migration: Arrived within the last 5 years',
  'Migration: Arrived within the last 10 years',
  'Migration: Arrived within the last 15 years',
  'Migration: Arrived within the last 20 years',
  'Migration: Arrived within the last 30 years',
  'Migration: Arrived more than 30 years ago',
  'Migration: Prefer not to say'
)

tab_model(mod, 
          pred.labels = labels, 
          dv.labels = "Outsourcing",
          show.r2 = FALSE)
  Outsourcing
Predictors Odds Ratios CI p
Intercept 0.49 0.38 – 0.62 <0.001
Ethnicity: Irish 0.76 0.41 – 1.33 0.362
Ethnicity: Gypsy or Irish Traveller 1.00 0.18 – 3.75 0.998
Ethnicity: Roma 1.25 0.20 – 5.71 0.788
Ethnicity: Any other White background 0.80 0.59 – 1.08 0.145
Ethnicity: White and Black Caribbean 0.49 0.18 – 1.09 0.112
Ethnicity: White and Black African 2.66 1.30 – 5.30 0.006
Ethnicity: White and Asian 1.19 0.53 – 2.43 0.647
Ethnicity: Any other Mixed/Multiple ethnic background 2.00 1.04 – 3.68 0.031
Ethnicity: Indian 1.18 0.87 – 1.57 0.277
Ethnicity: Pakistani 2.17 1.51 – 3.08 <0.001
Ethnicity: Bangladeshi 1.32 0.78 – 2.19 0.289
Ethnicity: Chinese 0.62 0.37 – 1.00 0.059
Ethnicity: Any other Asian background 1.19 0.80 – 1.75 0.381
Ethnicity: African 1.44 1.02 – 2.03 0.037
Ethnicity: Caribbean 1.27 0.68 – 2.22 0.423
Ethnicity: Any other Black, Black British, or Caribbean background 1.84 0.87 – 3.69 0.095
Ethnicity: Arab 1.97 0.91 – 4.08 0.075
Ethnicity: Any other ethnic group 1.03 0.40 – 2.33 0.942
Ethnicity: Don't think of myself as any of these 2.31 0.45 – 9.63 0.266
Ethnicity: Prefer not to say 1.37 0.53 – 3.17 0.482
Age 0.98 0.97 – 0.98 <0.001
Gender: Male 1.44 1.29 – 1.61 <0.001
Gender: Other 1.27 0.26 – 4.28 0.726
Gender: Prefer not to say 1.20 0.36 – 3.10 0.739
Education: Has degree 0.95 0.84 – 1.06 0.348
Education: Don't know 1.09 0.70 – 1.66 0.679
Region: East Midlands 0.92 0.73 – 1.17 0.510
Region: East of England 0.57 0.45 – 0.73 <0.001
Region: North East 0.61 0.44 – 0.85 0.004
Region: North West 0.81 0.65 – 1.01 0.061
Region: Northern Ireland 0.73 0.48 – 1.06 0.107
Region: Scotland 0.67 0.52 – 0.85 0.001
Region: South East 0.60 0.49 – 0.74 <0.001
Region: South West 0.66 0.51 – 0.84 0.001
Region: Wales 0.88 0.66 – 1.16 0.368
Region: West Midlands 0.81 0.64 – 1.01 0.059
Region: Yorkshire and the Humber 0.65 0.51 – 0.83 0.001
Migration: Arrived within the last year 1.76 1.16 – 2.64 0.007
Migration: Arrived within the last 3 years 1.10 0.73 – 1.63 0.628
Migration: Arrived within the last 5 years 1.23 0.80 – 1.85 0.328
Migration: Arrived within the last 10 years 1.76 1.26 – 2.43 0.001
Migration: Arrived within the last 15 years 1.84 1.27 – 2.64 0.001
Migration: Arrived within the last 20 years 1.69 1.14 – 2.46 0.008
Migration: Arrived within the last 30 years 0.49 0.24 – 0.90 0.033
Migration: Arrived more than 30 years ago 2.11 1.48 – 2.98 <0.001
Migration: Prefer not to say 1.22 0.73 – 1.98 0.433
Observations 9812

We next focus on predicting whether a person was outsourced based on wehther the person was born in the UK. This binary variable was constructed by collapsing the 10-level migration variable down into two levels, so that “I was born in the UK” becomes “Born in UK”, and all levels apart from “I was born in the UK” become “Not born in UK”.

Code
mod <- glm(outsourcing_status ~ BORNUK_binary + Ethnicity_collapsed_disaggregated + Age + Gender + Has_Degree + Region, data, weights = NatRepemployees, family="quasibinomial")
summary(mod)

tjur <- data %>%
  filter(!is.na(Ethnicity_collapsed_disaggregated)) %>%
  mutate(pred = predict(mod, type = "response")) %>%
  group_by(outsourcing_status) %>%
  summarise(weighted_mean = weighted.mean(pred, NatRepemployees)) %>%
  summarise(tjur_r2 = diff(weighted_mean)) %>%
  pull()

coefs <- extract_glm_coefs(mod, only_sig=T)

The value of Tjur’s pseudo R2 for this model is 0.04. The table below shows the model coefficients.

Code
labels <- c(
  'Intercept',
  'Migration: Not born in the UK',
  "Ethnicity: Irish",
  "Ethnicity: Gypsy or Irish Traveller",
  "Ethnicity: Roma",
  "Ethnicity: Any other White background",
  "Ethnicity: White and Black Caribbean",
  "Ethnicity: White and Black African",
  "Ethnicity: White and Asian",
  "Ethnicity: Any other Mixed/Multiple ethnic background",
  "Ethnicity: Indian",
  "Ethnicity: Pakistani",
  "Ethnicity: Bangladeshi",
  "Ethnicity: Chinese",
  "Ethnicity: Any other Asian background",
  "Ethnicity: African",
  "Ethnicity: Caribbean",
  "Ethnicity: Any other Black, Black British, or Caribbean background",
  "Ethnicity: Arab",
  "Ethnicity: Any other ethnic group",
  "Ethnicity: Don't think of myself as any of these",
  "Ethnicity: Prefer not to say",
  'Age',
  'Gender: Male',
  'Gender: Other',
  'Gender: Prefer not to say',
  'Education: Has degree',
  "Education: Don't know",
  'Region: East Midlands',
  'Region: East of England',
  'Region: North East',
  'Region: North West',
  'Region: Northern Ireland',
  'Region: Scotland',
  'Region: South East',
  'Region: South West',
  'Region: Wales',
  'Region: West Midlands',
  'Region: Yorkshire and the Humber'
)

tab_model(mod, 
          pred.labels = labels, 
          dv.labels = "Outsourcing",
          show.r2 = FALSE)
  Outsourcing
Predictors Odds Ratios CI p
Intercept 0.48 0.38 – 0.62 <0.001
Migration: Not born in the UK 1.49 1.24 – 1.79 <0.001
Ethnicity: Irish 0.76 0.41 – 1.32 0.349
Ethnicity: Gypsy or Irish Traveller 1.05 0.19 – 3.91 0.949
Ethnicity: Roma 1.14 0.18 – 5.08 0.872
Ethnicity: Any other White background 0.83 0.62 – 1.10 0.201
Ethnicity: White and Black Caribbean 0.48 0.18 – 1.08 0.108
Ethnicity: White and Black African 2.58 1.27 – 5.13 0.007
Ethnicity: White and Asian 1.23 0.55 – 2.50 0.590
Ethnicity: Any other Mixed/Multiple ethnic background 1.82 0.95 – 3.33 0.059
Ethnicity: Indian 1.14 0.85 – 1.51 0.389
Ethnicity: Pakistani 2.14 1.49 – 3.04 <0.001
Ethnicity: Bangladeshi 1.25 0.74 – 2.06 0.393
Ethnicity: Chinese 0.61 0.36 – 0.97 0.046
Ethnicity: Any other Asian background 1.22 0.83 – 1.78 0.303
Ethnicity: African 1.47 1.06 – 2.04 0.020
Ethnicity: Caribbean 1.25 0.67 – 2.18 0.460
Ethnicity: Any other Black, Black British, or Caribbean background 1.73 0.82 – 3.45 0.132
Ethnicity: Arab 2.07 0.96 – 4.26 0.053
Ethnicity: Any other ethnic group 1.06 0.41 – 2.38 0.902
Ethnicity: Don't think of myself as any of these 2.31 0.46 – 9.27 0.257
Ethnicity: Prefer not to say 1.25 0.48 – 2.86 0.622
Age 0.98 0.97 – 0.98 <0.001
Gender: Male 1.44 1.29 – 1.61 <0.001
Gender: Other 1.27 0.26 – 4.27 0.728
Gender: Prefer not to say 1.14 0.34 – 2.94 0.812
Education: Has degree 0.95 0.85 – 1.06 0.375
Education: Don't know 1.11 0.71 – 1.68 0.632
Region: East Midlands 0.93 0.74 – 1.17 0.534
Region: East of England 0.57 0.44 – 0.72 <0.001
Region: North East 0.61 0.44 – 0.84 0.003
Region: North West 0.81 0.65 – 1.00 0.053
Region: Northern Ireland 0.72 0.48 – 1.06 0.102
Region: Scotland 0.66 0.52 – 0.84 0.001
Region: South East 0.60 0.49 – 0.74 <0.001
Region: South West 0.66 0.52 – 0.84 0.001
Region: Wales 0.88 0.66 – 1.16 0.357
Region: West Midlands 0.80 0.64 – 1.00 0.055
Region: Yorkshire and the Humber 0.65 0.51 – 0.82 <0.001
Observations 9812
Code
mod <- glm(outsourcing_status ~ BORNUK_binary * Ethnicity_collapsed + Age + Gender + Has_Degree + Region, data, weights = NatRepemployees, family="quasibinomial")
summary(mod)

tjur <- data %>%
  #filter(!is.na(Ethnicity_collapsed_disaggregated)) %>%
  mutate(pred = predict(mod, type = "response")) %>%
  group_by(outsourcing_status) %>%
  summarise(weighted_mean = weighted.mean(pred, NatRepemployees)) %>%
  summarise(tjur_r2 = diff(weighted_mean)) %>%
  pull()

coefs <- extract_glm_coefs(mod, only_sig=T)

For model 1, the value of Tjur’s pseudo R2 is 0.038. The table below shows the model coefficients.

Code
labels <- c(
  'Intercept',
  'Migration: Not born in the UK',
  'Ethnicity: Arab/British Arab',
  'Ethnicity: Asian/Asian British',
  'Ethnicity: Black/African/Caribbean/Black British',
  'Ethnicity: Mixed/Multiple ethnic group',
  'Ethnicity: Other ethnic group',
  'Ethnicity: Prefer not to say',
  'Ethnicity: White other',
  'Age',
  'Gender: Male',
  'Gender: Other',
  'Gender: Prefer not to say',
  'Education: Has degree',
  "Education: Don't know",
  'Region: East Midlands',
  'Region: East of England',
  'Region: North East',
  'Region: North West',
  'Region: Northern Ireland',
  'Region: Scotland',
  'Region: South East',
  'Region: South West',
  'Region: Wales',
  'Region: West Midlands',
  'Region: Yorkshire and the Humber',
  'Interaction: Not born in UK x Arab/Arab British',
  'Interaction: Not born in UK x Asian/Asian British',
  'Interaction: Not born in UK x Black/African/Caribbean/Black British',
  'Interaction: Not born in UK x Mixed/Multiple ethnic group',
  'Interaction: Not born in UK x Other ethnic group',
  'Interaction: Not born in UK x Prefer not to say',
  'Interaction: Not born in UK x White other'
  
)

tab_model(mod, 
          pred.labels = labels, 
          dv.labels = "Outsourcing",
          show.r2 = FALSE)
  Outsourcing
Predictors Odds Ratios CI p
Intercept 0.46 0.36 – 0.59 <0.001
Migration: Not born in the UK 2.16 1.65 – 2.80 <0.001
Ethnicity: Arab/British Arab 1.89 0.49 – 6.14 0.310
Ethnicity: Asian/Asian British 1.50 1.17 – 1.90 0.001
Ethnicity: Black/African/Caribbean/Black British 1.55 1.01 – 2.32 0.038
Ethnicity: Mixed/Multiple ethnic group 1.17 0.75 – 1.78 0.470
Ethnicity: Other ethnic group 3.36 1.15 – 9.70 0.023
Ethnicity: Prefer not to say 1.21 0.85 – 1.68 0.268
Ethnicity: White other 1.02 0.63 – 1.58 0.938
Age 0.98 0.98 – 0.98 <0.001
Gender: Male 1.41 1.26 – 1.57 <0.001
Gender: Other 1.15 0.23 – 3.80 0.842
Gender: Prefer not to say 0.69 0.29 – 1.43 0.354
Education: Has degree 0.91 0.82 – 1.02 0.118
Education: Don't know 0.94 0.63 – 1.39 0.777
Region: East Midlands 0.93 0.74 – 1.16 0.510
Region: East of England 0.60 0.47 – 0.76 <0.001
Region: North East 0.64 0.46 – 0.87 0.006
Region: North West 0.86 0.70 – 1.06 0.154
Region: Northern Ireland 0.80 0.55 – 1.14 0.230
Region: Scotland 0.71 0.56 – 0.89 0.004
Region: South East 0.62 0.50 – 0.76 <0.001
Region: South West 0.69 0.54 – 0.87 0.002
Region: Wales 0.90 0.68 – 1.18 0.435
Region: West Midlands 0.84 0.67 – 1.04 0.116
Region: Yorkshire and the Humber 0.67 0.53 – 0.85 0.001
Interaction: Not born in UK x Arab/Arab British 0.86 0.19 – 4.34 0.846
Interaction: Not born in UK x Asian/Asian British 0.51 0.34 – 0.77 0.001
Interaction: Not born in UK x Black/African/Caribbean/Black British 0.67 0.38 – 1.20 0.176
Interaction: Not born in UK x Mixed/Multiple ethnic group 1.19 0.55 – 2.60 0.655
Interaction: Not born in UK x Other ethnic group 0.00 NA – 0.43 0.924
Interaction: Not born in UK x Prefer not to say 0.82 0.45 – 1.49 0.516
Interaction: Not born in UK x White other 0.56 0.32 – 1.00 0.044
Observations 10155
Code
ems <- emmeans(mod, specs = "Ethnicity_collapsed", by = "BORNUK_binary")
cons <- summary(contrast(ems, "pairwise",adjust="tukey"))
sig_cons <- cons %>% filter(p.value < .05) %>%
  mutate(
    or = 1 / exp(estimate), .after=estimate # 1 / or because we want to express comparison - white(ref) (contrast expresses white(ref) - comparison)
  )

To explore the interaction we calculated estimated marginal means of the ethnicity x migration effect. Examining the effect of ethnicity within each level of migration, we find that:

  1. Among people born in the UK, Asian/Asian British people are 1.5 times more likely to be outsourced than White British people.
  2. Among people not born in the UK, White other workers are 0.57 as likely to be outsourced than White British people.
  3. Among people not born in the UK, White other workers are 0.54 as likely to be outsourced than Black/African/Caribbean/Black British people.

The plot below visualises these effects.

Code
sjPlot::plot_model(mod, type = "pred", legend.title="", terms = c("BORNUK_binary","Ethnicity_collapsed"), dodge=0.5) +#
  
  coord_flip() +
  xlab("") + ylab("Likelihood of being outsourced") +
  theme_minimal() 

Code
ems <- emmeans(mod, specs = "BORNUK_binary", by = "Ethnicity_collapsed")
cons <- summary(contrast(ems, "pairwise",adjust="tukey"))
sig_cons <- cons %>% filter(p.value < .05) %>%
  mutate(
    or = 1 / exp(estimate), .after=estimate # 1 / or because we want to express comparison - white(ref) (contrast expresses white(ref) - comparison)
  )

Examining the effect of migration for particular ethnicities, we find that:

  1. Among White British people, people not born in the UK are 2.16 more likely to be outsourced than people born in the UK
  2. Among people of Mixed/Multiple ethnic group, people not born in the UK are 2.57 more likely to be outsourced than people born in the UK
  3. Among people who preferred not to say their ethnicity, people not born in the UK are 1.77 more likely to be outsourced than people born in the UK

The plot below visualises these effects.

Code
sjPlot::plot_model(mod, type = "pred", legend.title="", terms = c("Ethnicity_collapsed","BORNUK_binary"), dodge=0.5) +#
  
  coord_flip() +
  xlab("") + ylab("Likelihood of being outsourced") +
  theme_minimal() 

Code
mod <- glm(outsourcing_status ~ BORNUK_binary * Ethnicity_collapsed_disaggregated + Age + Gender + Has_Degree + Region, data, weights = NatRepemployees, family="quasibinomial")
summary(mod)

tjur <- data %>%
  filter(!is.na(Ethnicity_collapsed_disaggregated)) %>%
  mutate(pred = predict(mod, type = "response")) %>%
  group_by(outsourcing_status) %>%
  summarise(weighted_mean = weighted.mean(pred, NatRepemployees)) %>%
  summarise(tjur_r2 = diff(weighted_mean)) %>%
  pull()

coefs <- extract_glm_coefs(mod, only_sig=T)
coefs_all <- extract_glm_coefs(mod)

For model 2, the value of Tjur’s pseudo R2 is 0.045. The table below shows the model coefficients.

Code
labels <- c(
  'Intercept',
  'Migration: Not born in the UK',
  "Ethnicity: Irish",
  "Ethnicity: Gypsy or Irish Traveller",
  "Ethnicity: Roma",
  "Ethnicity: Any other White background",
  "Ethnicity: White and Black Caribbean",
  "Ethnicity: White and Black African",
  "Ethnicity: White and Asian",
  "Ethnicity: Any other Mixed/Multiple ethnic background",
  "Ethnicity: Indian",
  "Ethnicity: Pakistani",
  "Ethnicity: Bangladeshi",
  "Ethnicity: Chinese",
  "Ethnicity: Any other Asian background",
  "Ethnicity: African",
  "Ethnicity: Caribbean",
  "Ethnicity: Any other Black, Black British, or Caribbean background",
  "Ethnicity: Arab",
  "Ethnicity: Any other ethnic group",
  "Ethnicity: Don't think of myself as any of these",
  "Ethnicity: Prefer not to say",
  'Age',
  'Gender: Male',
  'Gender: Other',
  'Gender: Prefer not to say',
  'Education: Has degree',
  "Education: Don't know",
  'Region: East Midlands',
  'Region: East of England',
  'Region: North East',
  'Region: North West',
  'Region: Northern Ireland',
  'Region: Scotland',
  'Region: South East',
  'Region: South West',
  'Region: Wales',
  'Region: West Midlands',
  'Region: Yorkshire and the Humber',
  "Interaction: Not born in UK x Irish",
  "Interaction: Not born in UK x Gypsy or Irish Traveller",
  # "Interaction: Not born in UK x Roma",
  "Interaction: Not born in UK x Any other White background",
  "Interaction: Not born in UK x White and Black Caribbean",
  "Interaction: Not born in UK x White and Black African",
  "Interaction: Not born in UK x White and Asian",
  "Interaction: Not born in UK x Any other Mixed/Multiple ethnic background",
  "Interaction: Not born in UK x Indian",
  "Interaction: Not born in UK x Pakistani",
  "Interaction: Not born in UK x Bangladeshi",
  "Interaction: Not born in UK x Chinese",
  "Interaction: Not born in UK x Any other Asian background",
  "Interaction: Not born in UK x African",
  "Interaction: Not born in UK x Caribbean",
  "Interaction: Not born in UK x Any other Black, Black British, or Caribbean background",
  "Interaction: Not born in UK x Arab",
  "Interaction: Not born in UK x Any other ethnic group",
  "Interaction: Not born in UK x Don't think of myself as any of these",
  "Interaction: Not born in UK x Prefer not to say"

  
)

tab_model(mod, 
          pred.labels = labels,
          dv.labels = "Outsourcing",
          show.r2 = FALSE)
  Outsourcing
Predictors Odds Ratios CI p
Intercept 0.45 0.35 – 0.58 <0.001
Migration: Not born in the UK 2.12 1.61 – 2.75 <0.001
Ethnicity: Irish 0.92 0.46 – 1.72 0.810
Ethnicity: Gypsy or Irish Traveller 1.67 0.30 – 6.98 0.505
Ethnicity: Roma 0.82 0.13 – 3.71 0.811
Ethnicity: Any other White background 1.11 0.53 – 2.14 0.757
Ethnicity: White and Black Caribbean 0.53 0.19 – 1.19 0.166
Ethnicity: White and Black African 3.39 1.26 – 8.76 0.012
Ethnicity: White and Asian 0.90 0.32 – 2.12 0.830
Ethnicity: Any other Mixed/Multiple ethnic background 1.88 0.78 – 4.13 0.135
Ethnicity: Indian 1.32 0.90 – 1.91 0.143
Ethnicity: Pakistani 2.68 1.73 – 4.08 <0.001
Ethnicity: Bangladeshi 1.81 0.90 – 3.49 0.083
Ethnicity: Chinese 0.54 0.20 – 1.16 0.152
Ethnicity: Any other Asian background 1.06 0.44 – 2.28 0.882
Ethnicity: African 1.53 0.77 – 2.84 0.200
Ethnicity: Caribbean 1.12 0.53 – 2.16 0.747
Ethnicity: Any other Black, Black British, or Caribbean background 2.62 1.10 – 5.90 0.023
Ethnicity: Arab 1.86 0.48 – 6.05 0.324
Ethnicity: Any other ethnic group 3.36 1.15 – 9.73 0.023
Ethnicity: Don't think of myself as any of these 5.42 0.63 – 41.39 0.090
Ethnicity: Prefer not to say 1.99 0.65 – 5.17 0.183
Age 0.98 0.98 – 0.98 <0.001
Gender: Male 1.44 1.29 – 1.61 <0.001
Gender: Other 1.26 0.25 – 4.26 0.736
Gender: Prefer not to say 1.08 0.32 – 2.82 0.888
Education: Has degree 0.95 0.84 – 1.06 0.339
Education: Don't know 1.11 0.72 – 1.69 0.620
Region: East Midlands 0.95 0.75 – 1.20 0.690
Region: East of England 0.59 0.47 – 0.75 <0.001
Region: North East 0.63 0.45 – 0.87 0.007
Region: North West 0.84 0.68 – 1.05 0.122
Region: Northern Ireland 0.72 0.48 – 1.06 0.109
Region: Scotland 0.69 0.54 – 0.88 0.003
Region: South East 0.62 0.51 – 0.77 <0.001
Region: South West 0.69 0.54 – 0.87 0.002
Region: Wales 0.91 0.68 – 1.21 0.521
Region: West Midlands 0.81 0.65 – 1.01 0.068
Region: Yorkshire and the Humber 0.67 0.52 – 0.86 0.001
Interaction: Not born in UK x Irish 0.37 0.08 – 1.36 0.163
Interaction: Not born in UK x Gypsy or Irish Traveller 0.00 NA – 96204611106190144.00 0.970
Interaction: Not born in UK x Any other White background 0.52 0.25 – 1.19 0.102
Interaction: Not born in UK x White and Black Caribbean 0.00 NA – 490664849159.12 0.968
Interaction: Not born in UK x White and Black African 0.43 0.10 – 1.73 0.231
Interaction: Not born in UK x White and Asian 2.80 0.47 – 18.40 0.259
Interaction: Not born in UK x Any other Mixed/Multiple ethnic background 0.72 0.20 – 2.55 0.602
Interaction: Not born in UK x Indian 0.55 0.31 – 1.00 0.051
Interaction: Not born in UK x Pakistani 0.42 0.19 – 0.89 0.024
Interaction: Not born in UK x Bangladeshi 0.35 0.12 – 0.98 0.047
Interaction: Not born in UK x Chinese 0.90 0.33 – 2.74 0.838
Interaction: Not born in UK x Any other Asian background 0.87 0.35 – 2.35 0.774
Interaction: Not born in UK x African 0.70 0.33 – 1.54 0.362
Interaction: Not born in UK x Caribbean 1.19 0.31 – 4.35 0.793
Interaction: Not born in UK x Any other Black, Black British, or Caribbean background 0.20 0.03 – 0.98 0.062
Interaction: Not born in UK x Arab 0.87 0.19 – 4.43 0.864
Interaction: Not born in UK x Any other ethnic group 0.00 NA – 0.43 0.924
Interaction: Not born in UK x Don't think of myself as any of these 0.13 0.00 – 2.46 0.186
Interaction: Not born in UK x Prefer not to say 0.19 0.02 – 1.24 0.102
Observations 9812
Code
ems <- emmeans(mod, specs = "Ethnicity_collapsed_disaggregated", by = "BORNUK_binary")
cons <- summary(contrast(ems, "pairwise",adjust="tukey"))
sig_cons <- cons %>% filter(p.value < .05) %>%
  mutate(
    or = 1 / exp(estimate), .after=estimate # 1 / or because we want to express comparison - white(ref) (contrast expresses white(ref) - comparison)
  )

Again for this model we calculated estimated marginal means of the ethnicity x migration effect. Examining the effect of ethnicity within each level of migration, we find that:

  1. Among people born in the UK, Pakistani workers are 2.68 times more likely to be outsourced than English/Welsh/Scottish/Northern Irish/British workers.

No other differences were significant. The plot below visualises these effects.

Code
sjPlot::plot_model(mod, type = "pred", legend.title="", terms = c("BORNUK_binary","Ethnicity_collapsed_disaggregated"), dodge=0.5) +#
  
  coord_flip() +
  xlab("") + ylab("Likelihood of being outsourced") +
  theme_minimal() +
  theme(
    legend.position = "none"
  )

Code
ems <- emmeans(mod, specs = "BORNUK_binary", by = "Ethnicity_collapsed_disaggregated")
cons <- summary(contrast(ems, "pairwise",adjust="tukey"))
sig_cons <- cons %>% filter(p.value < .05) %>%
  mutate(
    or = 1 / exp(estimate), .after=estimate # 1 / or because we want to express comparison - white(ref) (contrast expresses white(ref) - comparison)
  )

Examining the effect of migration for particular ethnicities, we find that:

  1. Among English/Welsh/Scottish/Northern Irish/British workers, people not born in the UK are 2.12 more likely to be outsourced than people born in the UK
  2. Among Mixed White and Asian workers, people not born in the UK are 5.93 more likely to be outsourced than people born in the UK (NB n < 50 for this group).

No other differences were significant. The plot below visualises these findings.

Code
sjPlot::plot_model(mod, type = "pred", legend.title="", terms = c("Ethnicity_collapsed_disaggregated","BORNUK_binary"), dodge=0.5) +#
  
  coord_flip() +
  xlab("") + ylab("Likelihood of being outsourced") +
  theme_minimal() 

5 Outsourced workers are more likely to be men, but women are more likely to be low-paid outsourced workers

Code
mod <- glm(outsourcing_status ~ BORNUK_binary + Ethnicity_collapsed_disaggregated + Age + Gender + Has_Degree + Region, data, weights = NatRepemployees, family="quasibinomial")
summary(mod)

coefs <- extract_glm_coefs(mod, only_sig=T)

gender_coefs <- coefs %>%
  filter(variable == "GenderMale")

The same model as the previous was used to estimate the effect of Gender on outsourcing, and indicates that men are 1.44 times more likely to be outsourced than women.

Code
# Annual income
# Intercept only
mod_base <- lm(income_annual_all ~ 1, income_data, weights = NatRepemployees)
# H1
mod_annual <- lm(income_annual_all ~ Age + Gender + Has_Degree + Ethnicity_collapsed + Region + outsourcing_status + BORNUK_labelled, income_data, weights = NatRepemployees)

summary(mod_annual)

# F and p
f_annual <- round(anova(mod_base, mod_annual)[2,"F"],2)
p_annual <- anova(mod_base, mod_annual)[2,"Pr(>F)"]
if(p_annual < .001){
  p_annual = "< .001"
} else{
  p_annual = paste0("= ",round(p_annual,3))
}

# Degrees of freedom
dfs_annual <- as.list(anova(mod_base, mod_annual)[2,c("Df","Res.Df")])
# R2
rsquare_annual <- round(summary(mod_annual)$r.squared,2)

# Weekly income
# Intercept only
mod_base <- lm(income_weekly_all ~ 1, income_data, weights = NatRepemployees)
# H1
mod_weekly <- lm(income_weekly_all ~ Age + Gender + Has_Degree + Ethnicity_collapsed + Region + outsourcing_status + BORNUK_labelled, income_data, weights = NatRepemployees)
summary(mod_weekly)

# F and p
f_weekly <- round(anova(mod_base, mod_weekly)[2,"F"],2)
p_weekly <- anova(mod_base, mod_weekly)[2,"Pr(>F)"]
if(p_weekly < .001){
  p_weekly = "< .001"
} else{
  p_weekly = paste0("= ",round(p_weekly,3))
}

# Degrees of freedom
dfs_weekly <- as.list(anova(mod_base, mod_weekly)[2,c("Df","Res.Df")])
# R2
rsquare_weekly <- round(summary(mod_weekly)$r.squared,2)
Code
annual_gender_coef <- extract_lm_coefs(mod_annual, only_sig = T) %>%
  filter(variable == "GenderMale") 

weekly_gender_coef <- extract_lm_coefs(mod_weekly, only_sig = T) %>%
  filter(variable == "GenderMale") 

Exploring a possible gender pay gap, as shown in Table 1 and Table 2 there is a significant difference in pay between men and women. Annually, men earn £7027.53 more than women. Per week, men earn £150.99 more than women.

We next explored whether outsourcing compounds this gender pay gap by adding an outsourcing:gender interaction term into the previous models.

Code
# Annual income

# H1
mod_annual_int <- lm(income_annual_all ~ Age + Gender + Has_Degree + Ethnicity_collapsed + Region + outsourcing_status + BORNUK_labelled + Gender:outsourcing_status, income_data, weights = NatRepemployees)

summary(mod_annual_int)

# F and p
anova_test <- anova(mod_annual, mod_annual_int)
f_annual <- anova_test[2,"F"] %>% round(2)
p_annual <- anova_test[2,"Pr(>F)"]
if(p_annual < .001){
  p_annual = "< .001"
} else{
  p_annual = paste0("= ",round(p_annual,3))
}

# Degrees of freedom
dfs_annual <- as.list(anova_test[2,c("Df","Res.Df")])
# R2
rsquare_annual <- round(summary(mod_annual_int)$r.squared,2)

# Weekly income

# H1
mod_weekly_int <- lm(income_weekly_all ~ Age + Gender + Has_Degree + Ethnicity_collapsed + Region + outsourcing_status + BORNUK_labelled + Gender:outsourcing_status, income_data, weights = NatRepemployees)
summary(mod_weekly_int)

# F and p
anova_test <- anova(mod_weekly, mod_weekly_int)

f_weekly <-  anova_test[2,"F"] %>% round(2)
p_weekly <- anova_test[2,"Pr(>F)"]
if(p_weekly < .001){
  p_weekly = "< .001"
} else{
  p_weekly = paste0("= ", round(p_weekly,3))
}

# Degrees of freedom
dfs_weekly <- as.list(anova_test[2,c("Df","Res.Df")])
# R2
rsquare_weekly <- round(summary(mod_weekly_int)$r.squared,2)

For both models, adding the interaction effect did not improve model fit (R2 = 0.18, F(3, 8253) = 0.76, p = 0.514). The tables below show the coefficients for each model.

Code
labels <- c(
  'Intercept',
  'Age',
  'Gender: Male',
  'Gender: Other',
  'Gender: Prefer not to say',
  'Education: Has degree',
  "Education: Don't know",
  'Ethnicity: Arab/British Arab',
  'Ethnicity: Asian/Asian British',
  'Ethnicity: Black/African/Caribbean/Black British',
  'Ethnicity: Mixed/Multiple ethnic group',
  'Ethnicity: Other ethnic group',
  'Ethnicity: Prefer not to say',
  'Ethnicity: White other',
  'Region: East Midlands',
  'Region: East of England',
  'Region: North East',
  'Region: North West',
  'Region: Northern Ireland',
  'Region: Scotland',
  'Region: South East',
  'Region: South West',
  'Region: Wales',
  'Region: West Midlands',
  'Region: Yorkshire and the Humber',
  'Outsourcing: Outsourced',
  'Migration: Arrived within the last year',
  'Migration: Arrived within the last 3 years',
  'Migration: Arrived within the last 5 years',
  'Migration: Arrived within the last 10 years',
  'Migration: Arrived within the last 15 years',
  'Migration: Arrived within the last 20 years',
  'Migration: Arrived within the last 30 years',
  'Migration: Arrived more than 30 years ago',
  'Migration: Prefer not to say',
  'Interaction: Outsourcing x Gender Male',
  'Interaction: Outsourcing x Gender Other',
  'Interaction: Outsourcing x Gender Prefer not to say'
)
  
tab_model(mod_annual_int, pred.labels = labels, dv.labels = "Annual income")
Table 4: Linear regression predicting annual income with outsourcing:gender interaction term
  Annual income
Predictors Estimates CI p
Intercept 23921.57 22660.06 – 25183.08 <0.001
Age 14.52 -6.10 – 35.14 0.167
Gender: Male 6994.01 6417.97 – 7570.04 <0.001
Gender: Other 3525.48 -4019.27 – 11070.23 0.360
Gender: Prefer not to say 4366.44 -1326.82 – 10059.69 0.133
Education: Has degree 8190.56 7643.58 – 8737.54 <0.001
Education: Don't know -1963.22 -4091.44 – 165.00 0.071
Ethnicity: Arab/British Arab -172.85 -4863.98 – 4518.28 0.942
Ethnicity: Asian/Asian British -466.43 -1560.35 – 627.49 0.403
Ethnicity: Black/African/Caribbean/Black British -1176.65 -2780.51 – 427.21 0.150
Ethnicity: Mixed/Multiple ethnic group -1514.57 -3493.64 – 464.51 0.134
Ethnicity: Other ethnic group 3570.13 -1022.61 – 8162.86 0.128
Ethnicity: Prefer not to say -243.51 -1963.50 – 1476.48 0.781
Ethnicity: White other -586.40 -1950.14 – 777.34 0.399
Region: East Midlands -5792.15 -7008.07 – -4576.23 <0.001
Region: East of England -4101.18 -5244.25 – -2958.10 <0.001
Region: North East -4877.65 -6407.08 – -3348.23 <0.001
Region: North West -4501.54 -5608.43 – -3394.65 <0.001
Region: Northern Ireland -6563.67 -8340.41 – -4786.94 <0.001
Region: Scotland -5489.92 -6678.61 – -4301.23 <0.001
Region: South East -3433.17 -4473.52 – -2392.82 <0.001
Region: South West -5704.35 -6891.22 – -4517.48 <0.001
Region: Wales -5388.98 -6835.17 – -3942.79 <0.001
Region: West Midlands -5021.92 -6178.63 – -3865.21 <0.001
Region: Yorkshire and the Humber -5547.51 -6730.41 – -4364.60 <0.001
Outsourcing: Outsourced -3031.29 -4081.44 – -1981.15 <0.001
Migration: Arrived within the last year -6140.84 -8382.45 – -3899.24 <0.001
Migration: Arrived within the last 3 years -2391.65 -4414.37 – -368.92 0.020
Migration: Arrived within the last 5 years -2057.10 -4287.24 – 173.05 0.071
Migration: Arrived within the last 10 years -637.25 -2383.00 – 1108.51 0.474
Migration: Arrived within the last 15 years 718.11 -1222.02 – 2658.23 0.468
Migration: Arrived within the last 20 years 1305.40 -778.66 – 3389.46 0.220
Migration: Arrived within the last 30 years 3486.00 1066.44 – 5905.57 0.005
Migration: Arrived more than 30 years ago -194.33 -2114.72 – 1726.06 0.843
Migration: Prefer not to say -2115.89 -5649.21 – 1417.43 0.240
Interaction: Outsourcing x Gender Male 192.17 -1217.63 – 1601.98 0.789
Interaction: Outsourcing x Gender Other -12336.38 -28857.14 – 4184.38 0.143
Interaction: Outsourcing x Gender Prefer not to say -1318.07 -15431.44 – 12795.30 0.855
Observations 8291
R2 / R2 adjusted 0.184 / 0.180
Code
tab_model(mod_weekly_int, pred.labels = labels, dv.labels = "Weekly income")
Table 5: Linear regression predicting weekly income with outsourcing:gender interaction term
  Weekly income
Predictors Estimates CI p
Intercept 513.97 486.86 – 541.07 <0.001
Age 0.31 -0.13 – 0.76 0.167
Gender: Male 150.27 137.89 – 162.65 <0.001
Gender: Other 75.75 -86.36 – 237.85 0.360
Gender: Prefer not to say 93.82 -28.51 – 216.14 0.133
Education: Has degree 175.98 164.23 – 187.73 <0.001
Education: Don't know -42.18 -87.91 – 3.55 0.071
Ethnicity: Arab/British Arab -3.71 -104.51 – 97.08 0.942
Ethnicity: Asian/Asian British -10.02 -33.53 – 13.48 0.403
Ethnicity: Black/African/Caribbean/Black British -25.28 -59.74 – 9.18 0.150
Ethnicity: Mixed/Multiple ethnic group -32.54 -75.06 – 9.98 0.134
Ethnicity: Other ethnic group 76.71 -21.97 – 175.38 0.128
Ethnicity: Prefer not to say -5.23 -42.19 – 31.72 0.781
Ethnicity: White other -12.60 -41.90 – 16.70 0.399
Region: East Midlands -124.45 -150.57 – -98.32 <0.001
Region: East of England -88.12 -112.68 – -63.56 <0.001
Region: North East -104.80 -137.66 – -71.94 <0.001
Region: North West -96.72 -120.50 – -72.94 <0.001
Region: Northern Ireland -141.02 -179.20 – -102.85 <0.001
Region: Scotland -117.95 -143.49 – -92.41 <0.001
Region: South East -73.76 -96.12 – -51.41 <0.001
Region: South West -122.56 -148.06 – -97.06 <0.001
Region: Wales -115.79 -146.86 – -84.71 <0.001
Region: West Midlands -107.90 -132.75 – -83.05 <0.001
Region: Yorkshire and the Humber -119.19 -144.61 – -93.78 <0.001
Outsourcing: Outsourced -65.13 -87.69 – -42.57 <0.001
Migration: Arrived within the last year -131.94 -180.10 – -83.78 <0.001
Migration: Arrived within the last 3 years -51.39 -94.85 – -7.93 0.020
Migration: Arrived within the last 5 years -44.20 -92.11 – 3.72 0.071
Migration: Arrived within the last 10 years -13.69 -51.20 – 23.82 0.474
Migration: Arrived within the last 15 years 15.43 -26.26 – 57.11 0.468
Migration: Arrived within the last 20 years 28.05 -16.73 – 72.82 0.220
Migration: Arrived within the last 30 years 74.90 22.91 – 126.88 0.005
Migration: Arrived more than 30 years ago -4.18 -45.44 – 37.09 0.843
Migration: Prefer not to say -45.46 -121.38 – 30.45 0.240
Interaction: Outsourcing x Gender Male 4.13 -26.16 – 34.42 0.789
Interaction: Outsourcing x Gender Other -265.05 -620.01 – 89.90 0.143
Interaction: Outsourcing x Gender Prefer not to say -28.32 -331.55 – 274.91 0.855
Observations 8291
R2 / R2 adjusted 0.184 / 0.180
Code
ems <- emmeans(mod_annual_int, specs = "Gender", by = "outsourcing_status", nuisance = "BORNUK_labelled")
cons <- summary(contrast(ems, "pairwise",adjust="tukey"))
sig_cons <- cons %>% filter(p.value < .05)
Code
ems <- emmeans(mod_annual_int, specs = "outsourcing_status", by = "Gender", nuisance = "BORNUK_labelled")
cons <- summary(contrast(ems, "pairwise",adjust="tukey"))
sig_cons2 <- cons %>% filter(p.value < .05) 

The interaction term is non-significant. Estimated marginal means show that:

  • Among not outsourced workers, men are paid £6994.01 more than women
  • Among outsourced workers, men are paid £7186.18 more than women
  • Among men, not outsourced workers are paid £2839.12 more than outsourced workers.
  • Among women, not outsourced workers are paid £3031.29 more than outsourced workers.

The plot below illustrates the main effects that men are paid more than women and that outsourced men and women are paid less than non-outsourced men and women. The lack of interaction indicates that the difference in pay between men and women does not significantly differ between outsourced and non-outsourced people.

Code
sjPlot::plot_model(mod_annual_int, type = "pred", legend.title="", terms = c("outsourcing_status","Gender"), dodge=0.5) +#
  
  coord_flip() +
  xlab("") + ylab("Likelihood of being outsourced") +
  theme_minimal() 

6 Age

Code
mod <- glm(outsourcing_status ~ BORNUK_binary + Ethnicity_collapsed_disaggregated + Age + Gender + Has_Degree + Region, data, weights = NatRepemployees, family="quasibinomial")
summary(mod)

coefs <- extract_glm_coefs(mod, only_sig=T)

age_coefs <- coefs %>%
  filter(variable == "Age")

We found that age was a significant predictor of the likelihood of being outsourced. The model indicates that for each year older a worker is, they are 0.98 times as likely (i.e. 2% less likely) to be outsourced.