JRF Nat Rep - Data cleaning

Authors

Jolyon Miles-Wilson

Celestin Okoroji

Published

May 1, 2025

Code
rm(list = ls())
# Load libraries
library(gridExtra)
library(skimr)
library(haven)
library(tidyverse)
library(wesanderson)
library(Hmisc)
colours <- wes_palette("GrandBudapest2",4,"discrete")

Resolving the two datasets received from Opinium

We received two datasets from Opinium on two separate occasions. The second contains occupation and sector data that the first didn't include. The first thing we need to do is understand the differences between the two datasets, to determine whether we can simply use one of them and, if so, which.

TLDR

The second dataset is an updated version of the first. Although there are some variables that are present in one dataset but not in the other, investigating these differences indicates that there is no problem with simply using the second dataset instead of the first. Read on for details on the steps taken to determine this.

Code
# Read the first data we received from opinium
data <- read_sav("../Data/UK23626 Workers sample data with nat rep and graduates weight.sav")
# View(data)

# read the second data we received from opinium. This includes more job/sector information
data_2 <- read_sav("../Data/JRF Outsourced Workers - Occupations and Sectors.sav")
# skim(data_2)
Code
# Investigate the differences between the two datasets
check_columns <- function(data1, data2){
  variables_absent <- c()
  cols1 <- colnames(data1)
  cols2 <- colnames(data2)
  
  # For each column in data1, check whether it exists in data2
  for(this_var in cols1){
    if(!(this_var %in% cols2)){
      # Create vector of all missing vars
      variables_absent <- append(variables_absent, this_var)
    }
  }
  missing <- length(variables_absent)
  cat(paste0(missing, " columns in ", deparse(substitute(data1)), " missing from ", deparse(substitute(data2)),":"))
  cat(paste0("\n",variables_absent))
  cat("\n")
  return(variables_absent)
}

vars_absent1 <- check_columns(data, data_2)
11 columns in data missing from data_2:
StartTime 
Agegender_recode_useforquota 
Q3v1_1 
Q3v1_2 
Q3v1_3 
Q3v1_4 
Q3v1_5 
Q3v2_1 
Q3v2_2 
Q3v4a 
Q3v4b
Code
vars_absent2 <- check_columns(data_2, data)
9 columns in data_2 missing from data:
ed_bands 
unit_code 
UnitOccupation 
smg_code 
MajorsubgroupOccupation 
major_code 
Majorgroupcode 
SectorCode 
SectorName

Looking at the variables in data that are missing from data_2, all but two are completely missing (complete_rate = 0), and the remaining two (StartTime and Agegender_recode_useforquota) are not needed. This indicates that we can simply switch to using data_2 instead of data.

Code
vars_absent1_subset <- dplyr::select(data, dplyr::all_of(vars_absent1))

skim(vars_absent1_subset)
Data summary
Name vars_absent1_subset
Number of rows 10155
Number of columns 11
_______________________
Column type frequency:
numeric 11
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
StartTime 0 1 152128.49 49331.93 4 125642 160423 185037.5 235940 ▁▂▅▇▅
Agegender_recode_useforquota 0 1 5.54 2.72 1 3 6 8.0 11 ▇▅▅▇▁
Q3v1_1 10155 0 NaN NA NA NA NA NA NA
Q3v1_2 10155 0 NaN NA NA NA NA NA NA
Q3v1_3 10155 0 NaN NA NA NA NA NA NA
Q3v1_4 10155 0 NaN NA NA NA NA NA NA
Q3v1_5 10155 0 NaN NA NA NA NA NA NA
Q3v2_1 10155 0 NaN NA NA NA NA NA NA
Q3v2_2 10155 0 NaN NA NA NA NA NA NA
Q3v4a 10155 0 NaN NA NA NA NA NA NA
Q3v4b 10155 0 NaN NA NA NA NA NA NA

Visual inspection of a skim of each dataset indicates that the variables the two datasets share are identical.
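
As a complement to visual inspection, the shared columns can also be compared programmatically. A minimal sketch, assuming the two files contain the same respondents in the same row order:

Code
# compare every column the two datasets have in common
common_cols <- intersect(colnames(data), colnames(data_2))
all(vapply(common_cols, function(col){
  identical(haven::zap_labels(data[[col]]), haven::zap_labels(data_2[[col]]))
}, logical(1)))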

Code
skim(data)
Data summary
Name data
Number of rows 10155
Number of columns 88
_______________________
Column type frequency:
character 7
numeric 81
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MIProRspId 0 1 36 36 0 10155 0
E6_5_other 0 1 0 102 10142 14 0
E8A 0 1 2 124 0 4946 0
E8B 0 1 2 274 0 6559 0
E9A 0 1 2 185 0 4369 0
E10B_9_other 0 1 0 75 9992 132 0
Q2_3_other 0 1 0 93 9988 140 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
StartTime 0 1.00 152128.49 49331.93 4.00 125642.00 160423.00 185037.50 235940.00 ▁▂▅▇▅
D1_Gender_C 0 1.00 1.53 0.54 1.00 1.00 2.00 2.00 4.00 ▇▇▁▁▁
D2_Age 0 1.00 42.15 13.03 16.00 32.00 42.00 53.00 81.00 ▅▇▇▅▁
Agegender_recode_useforquota 0 1.00 5.54 2.72 1.00 3.00 6.00 8.00 11.00 ▇▅▅▇▁
D4_Region_C 0 1.00 7.25 3.39 1.00 4.00 7.00 10.00 12.00 ▃▅▃▃▇
D3_employment 0 1.00 1.24 0.45 1.00 1.00 1.00 1.00 3.00 ▇▁▂▁▁
E2 0 1.00 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁
SCD_Opt_In_1 0 1.00 1.03 0.18 1.00 1.00 1.00 1.00 2.00 ▇▁▁▁▁
SCD_Opt_In_2 0 1.00 1.07 0.26 1.00 1.00 1.00 1.00 2.00 ▇▁▁▁▁
SCD_Opt_In_3 0 1.00 1.04 0.20 1.00 1.00 1.00 1.00 2.00 ▇▁▁▁▁
Ethnicity 343 0.97 2.89 4.26 1.00 1.00 1.00 1.00 21.00 ▇▁▁▁▁
D6_educ1 0 1.00 1.47 0.52 1.00 1.00 1.00 2.00 3.00 ▇▁▆▁▁
D7_educ2_1 5527 0.46 0.56 0.50 0.00 0.00 1.00 1.00 1.00 ▆▁▁▁▇
D7_educ2_2 5527 0.46 0.26 0.44 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
D7_educ2_3 5527 0.46 0.21 0.41 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
D7_educ2_4 5527 0.46 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D7_educ2_5 5527 0.46 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D7_educ2_6 5527 0.46 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D7_educ2_7 5527 0.46 0.22 0.41 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
D7_educ2_8 5527 0.46 0.15 0.36 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
D7_educ2_9 5527 0.46 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D7_educ2_10 5527 0.46 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D7_educ2_11 5527 0.46 0.10 0.30 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D7_educ2_12 5527 0.46 0.05 0.22 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D7_educ2_13 5527 0.46 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
TradeUnion 737 0.93 1.71 0.48 1.00 1.00 2.00 2.00 3.00 ▃▁▇▁▁
D6_Children_1 0 1.00 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D6_Children_2 0 1.00 0.13 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D6_Children_3 0 1.00 0.10 0.30 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D6_Children_4 0 1.00 0.13 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D6_Children_5 0 1.00 0.10 0.30 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D6_Children_6 0 1.00 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D6_Children_7 0 1.00 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
D6_Children_8 0 1.00 0.26 0.44 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
D6_Children_9 0 1.00 0.39 0.49 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
E1 0 1.00 1.88 0.33 1.00 2.00 2.00 2.00 2.00 ▁▁▁▁▇
E3 0 1.00 1.07 0.44 1.00 1.00 1.00 1.00 5.00 ▇▁▁▁▁
E5 0 1.00 1.11 0.39 1.00 1.00 1.00 1.00 3.00 ▇▁▁▁▁
E6_1 9658 0.05 0.31 0.46 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
E6_2 9658 0.05 0.60 0.49 0.00 0.00 1.00 1.00 1.00 ▆▁▁▁▇
E6_3 9658 0.05 0.11 0.31 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
E6_4 9658 0.05 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
E6_5 9658 0.05 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
E7A 1 1.00 6.16 2.70 1.00 5.00 6.00 9.00 9.00 ▃▂▂▆▇
E7B 0 1.00 1.61 0.49 1.00 1.00 2.00 2.00 2.00 ▅▁▁▁▇
E9B 0 1.00 12.67 5.62 1.00 8.00 14.00 17.00 22.00 ▂▅▃▇▃
E10A 0 1.00 1.33 0.47 1.00 1.00 1.00 2.00 2.00 ▇▁▁▁▃
E10B 6827 0.33 4.55 2.03 1.00 3.00 4.00 6.00 9.00 ▂▇▂▆▁
Q1_1 0 1.00 1.84 0.37 1.00 2.00 2.00 2.00 2.00 ▂▁▁▁▇
Q1_2 0 1.00 1.81 0.39 1.00 2.00 2.00 2.00 2.00 ▂▁▁▁▇
Q1_3 0 1.00 1.67 0.47 1.00 1.00 2.00 2.00 2.00 ▃▁▁▁▇
Q1_4 0 1.00 1.79 0.41 1.00 2.00 2.00 2.00 2.00 ▂▁▁▁▇
Q1_5 0 1.00 1.80 0.40 1.00 2.00 2.00 2.00 2.00 ▂▁▁▁▇
Q1_6 0 1.00 1.71 0.46 1.00 1.00 2.00 2.00 2.00 ▃▁▁▁▇
Q2 0 1.00 1.13 0.38 1.00 1.00 1.00 1.00 3.00 ▇▁▁▁▁
Q3v3a 0 1.00 2.78 0.57 1.00 3.00 3.00 3.00 3.00 ▁▁▁▁▇
Q3v3b 9383 0.08 2.48 0.82 1.00 2.00 3.00 3.00 3.00 ▂▁▁▁▇
Q3v3c 9435 0.07 2.35 0.77 1.00 2.00 3.00 3.00 3.00 ▃▁▅▁▇
Q3v3d 1492 0.85 2.90 0.41 1.00 3.00 3.00 3.00 3.00 ▁▁▁▁▇
D7_Disability1 443 0.96 1.82 0.47 1.00 2.00 2.00 2.00 3.00 ▂▁▇▁▁
D8_Disability2 8052 0.21 2.13 0.67 1.00 2.00 2.00 3.00 4.00 ▂▇▁▃▁
INCOME_FREQ 0 1.00 1.76 0.85 1.00 1.00 2.00 2.00 4.00 ▇▇▁▂▁
INCOME_OPEN_1 2656 0.74 21132.05 29878.94 6.00 1516.50 13500.00 33000.00 750000.00 ▇▁▁▁▁
INCOME_OPEN_2 7500 0.26 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
INCOME_CLOSED_ANNUAL 9315 0.08 8.96 3.38 1.00 6.00 11.00 12.00 12.00 ▁▂▂▂▇
INCOME_CLOSED_MONTHLY 8748 0.14 7.86 4.12 1.00 4.00 8.00 12.00 12.00 ▃▃▂▁▇
INCOME_CLOSED_WEEKLY 9884 0.03 7.49 4.29 1.00 4.00 7.00 12.00 12.00 ▅▃▂▁▇
INCOME_CLOSED_HOURLY 10018 0.01 9.11 4.73 1.00 3.00 13.00 13.00 13.00 ▃▂▁▁▇
HOURS 0 1.00 34.40 9.18 4.00 30.00 37.50 40.00 84.00 ▂▃▇▁▁
BORNUK 0 1.00 1.73 1.97 1.00 1.00 1.00 1.00 10.00 ▇▁▁▁▁
Q3v1_1 10155 0.00 NaN NA NA NA NA NA NA
Q3v1_2 10155 0.00 NaN NA NA NA NA NA NA
Q3v1_3 10155 0.00 NaN NA NA NA NA NA NA
Q3v1_4 10155 0.00 NaN NA NA NA NA NA NA
Q3v1_5 10155 0.00 NaN NA NA NA NA NA NA
Q3v2_1 10155 0.00 NaN NA NA NA NA NA NA
Q3v2_2 10155 0.00 NaN NA NA NA NA NA NA
Q3v4a 10155 0.00 NaN NA NA NA NA NA NA
Q3v4b 10155 0.00 NaN NA NA NA NA NA NA
NatRepemployees 0 1.00 1.00 0.59 0.29 0.66 0.79 1.09 6.74 ▇▁▁▁▁
NatRepemployeesGraduate 0 1.00 1.00 0.33 0.34 0.76 1.00 1.12 3.89 ▇▃▁▁▁
Code
skim(data_2)
Data summary
Name data_2
Number of rows 10155
Number of columns 86
_______________________
Column type frequency:
character 7
numeric 79
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MIProRspId 0 1 36 36 0 10155 0
E6_5_other 0 1 0 102 10142 14 0
E8A 0 1 2 124 0 4946 0
E8B 0 1 2 274 0 6559 0
E9A 0 1 2 185 0 4369 0
E10B_9_other 0 1 0 75 9992 132 0
Q2_3_other 0 1 0 93 9988 140 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
D1_Gender_C 0 1.00 1.53 0.54 1.00 1.00 2.00 2.00 4.00e+00 ▇▇▁▁▁
D2_Age 0 1.00 42.15 13.03 16.00 32.00 42.00 53.00 8.10e+01 ▅▇▇▅▁
D4_Region_C 0 1.00 7.25 3.39 1.00 4.00 7.00 10.00 1.20e+01 ▃▅▃▃▇
D3_employment 0 1.00 1.24 0.45 1.00 1.00 1.00 1.00 3.00e+00 ▇▁▂▁▁
E2 0 1.00 1.00 0.00 1.00 1.00 1.00 1.00 1.00e+00 ▁▁▇▁▁
SCD_Opt_In_1 0 1.00 1.03 0.18 1.00 1.00 1.00 1.00 2.00e+00 ▇▁▁▁▁
SCD_Opt_In_2 0 1.00 1.07 0.26 1.00 1.00 1.00 1.00 2.00e+00 ▇▁▁▁▁
SCD_Opt_In_3 0 1.00 1.04 0.20 1.00 1.00 1.00 1.00 2.00e+00 ▇▁▁▁▁
Ethnicity 343 0.97 2.89 4.26 1.00 1.00 1.00 1.00 2.10e+01 ▇▁▁▁▁
D6_educ1 0 1.00 1.47 0.52 1.00 1.00 1.00 2.00 3.00e+00 ▇▁▆▁▁
D7_educ2_1 5527 0.46 0.56 0.50 0.00 0.00 1.00 1.00 1.00e+00 ▆▁▁▁▇
D7_educ2_2 5527 0.46 0.26 0.44 0.00 0.00 0.00 1.00 1.00e+00 ▇▁▁▁▃
D7_educ2_3 5527 0.46 0.21 0.41 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▂
D7_educ2_4 5527 0.46 0.06 0.24 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D7_educ2_5 5527 0.46 0.02 0.15 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D7_educ2_6 5527 0.46 0.02 0.13 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D7_educ2_7 5527 0.46 0.22 0.41 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▂
D7_educ2_8 5527 0.46 0.15 0.36 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▂
D7_educ2_9 5527 0.46 0.06 0.24 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D7_educ2_10 5527 0.46 0.07 0.26 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D7_educ2_11 5527 0.46 0.10 0.30 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D7_educ2_12 5527 0.46 0.05 0.22 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D7_educ2_13 5527 0.46 0.02 0.15 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
ed_bands 0 1.00 2.43 0.68 1.00 2.00 3.00 3.00 3.00e+00 ▂▁▅▁▇
TradeUnion 737 0.93 1.71 0.48 1.00 1.00 2.00 2.00 3.00e+00 ▃▁▇▁▁
D6_Children_1 0 1.00 0.02 0.14 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D6_Children_2 0 1.00 0.13 0.33 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D6_Children_3 0 1.00 0.10 0.30 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D6_Children_4 0 1.00 0.13 0.33 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D6_Children_5 0 1.00 0.10 0.30 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D6_Children_6 0 1.00 0.07 0.26 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D6_Children_7 0 1.00 0.06 0.24 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
D6_Children_8 0 1.00 0.26 0.44 0.00 0.00 0.00 1.00 1.00e+00 ▇▁▁▁▃
D6_Children_9 0 1.00 0.39 0.49 0.00 0.00 0.00 1.00 1.00e+00 ▇▁▁▁▅
E1 0 1.00 1.88 0.33 1.00 2.00 2.00 2.00 2.00e+00 ▁▁▁▁▇
E3 0 1.00 1.07 0.44 1.00 1.00 1.00 1.00 5.00e+00 ▇▁▁▁▁
E5 0 1.00 1.11 0.39 1.00 1.00 1.00 1.00 3.00e+00 ▇▁▁▁▁
E6_1 9658 0.05 0.31 0.46 0.00 0.00 0.00 1.00 1.00e+00 ▇▁▁▁▃
E6_2 9658 0.05 0.60 0.49 0.00 0.00 1.00 1.00 1.00e+00 ▆▁▁▁▇
E6_3 9658 0.05 0.11 0.31 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
E6_4 9658 0.05 0.05 0.21 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
E6_5 9658 0.05 0.03 0.16 0.00 0.00 0.00 0.00 1.00e+00 ▇▁▁▁▁
E7A 1 1.00 6.16 2.70 1.00 5.00 6.00 9.00 9.00e+00 ▃▂▂▆▇
E7B 0 1.00 1.61 0.49 1.00 1.00 2.00 2.00 2.00e+00 ▅▁▁▁▇
E9B 0 1.00 12.67 5.62 1.00 8.00 14.00 17.00 2.20e+01 ▂▅▃▇▃
E10A 0 1.00 1.33 0.47 1.00 1.00 1.00 2.00 2.00e+00 ▇▁▁▁▃
E10B 6827 0.33 4.55 2.03 1.00 3.00 4.00 6.00 9.00e+00 ▂▇▂▆▁
Q1_1 0 1.00 1.84 0.37 1.00 2.00 2.00 2.00 2.00e+00 ▂▁▁▁▇
Q1_2 0 1.00 1.81 0.39 1.00 2.00 2.00 2.00 2.00e+00 ▂▁▁▁▇
Q1_3 0 1.00 1.67 0.47 1.00 1.00 2.00 2.00 2.00e+00 ▃▁▁▁▇
Q1_4 0 1.00 1.79 0.41 1.00 2.00 2.00 2.00 2.00e+00 ▂▁▁▁▇
Q1_5 0 1.00 1.80 0.40 1.00 2.00 2.00 2.00 2.00e+00 ▂▁▁▁▇
Q1_6 0 1.00 1.71 0.46 1.00 1.00 2.00 2.00 2.00e+00 ▃▁▁▁▇
Q2 0 1.00 1.13 0.38 1.00 1.00 1.00 1.00 3.00e+00 ▇▁▁▁▁
Q3v3a 0 1.00 2.78 0.57 1.00 3.00 3.00 3.00 3.00e+00 ▁▁▁▁▇
Q3v3b 9383 0.08 2.48 0.82 1.00 2.00 3.00 3.00 3.00e+00 ▂▁▁▁▇
Q3v3c 9435 0.07 2.35 0.77 1.00 2.00 3.00 3.00 3.00e+00 ▃▁▅▁▇
Q3v3d 1492 0.85 2.90 0.41 1.00 3.00 3.00 3.00 3.00e+00 ▁▁▁▁▇
D7_Disability1 443 0.96 1.82 0.47 1.00 2.00 2.00 2.00 3.00e+00 ▂▁▇▁▁
D8_Disability2 8052 0.21 2.13 0.67 1.00 2.00 2.00 3.00 4.00e+00 ▂▇▁▃▁
INCOME_FREQ 0 1.00 1.76 0.85 1.00 1.00 2.00 2.00 4.00e+00 ▇▇▁▂▁
INCOME_OPEN_1 2656 0.74 21132.05 29878.94 6.00 1516.50 13500.00 33000.00 7.50e+05 ▇▁▁▁▁
INCOME_OPEN_2 7500 0.26 0.00 0.00 0.00 0.00 0.00 0.00 0.00e+00 ▁▁▇▁▁
INCOME_CLOSED_ANNUAL 9315 0.08 8.96 3.38 1.00 6.00 11.00 12.00 1.20e+01 ▁▂▂▂▇
INCOME_CLOSED_MONTHLY 8748 0.14 7.86 4.12 1.00 4.00 8.00 12.00 1.20e+01 ▃▃▂▁▇
INCOME_CLOSED_WEEKLY 9884 0.03 7.49 4.29 1.00 4.00 7.00 12.00 1.20e+01 ▅▃▂▁▇
INCOME_CLOSED_HOURLY 10018 0.01 9.11 4.73 1.00 3.00 13.00 13.00 1.30e+01 ▃▂▁▁▇
HOURS 0 1.00 34.40 9.18 4.00 30.00 37.50 40.00 8.40e+01 ▂▃▇▁▁
BORNUK 0 1.00 1.73 1.97 1.00 1.00 1.00 1.00 1.00e+01 ▇▁▁▁▁
unit_code 1 1.00 446.62 251.04 0.00 231.00 412.00 623.00 9.99e+02 ▃▇▅▅▃
UnitOccupation 1 1.00 52.72 29.69 1.00 30.00 49.00 83.00 1.04e+02 ▆▇▅▇▇
smg_code 1 1.00 44.37 25.12 0.00 23.00 41.00 62.00 9.90e+01 ▃▇▅▅▃
MajorsubgroupOccupation 1 1.00 11.06 8.31 1.00 3.00 9.00 18.00 2.70e+01 ▇▂▂▃▂
major_code 1 1.00 4.24 2.53 0.00 2.00 4.00 6.00 9.00e+00 ▃▇▅▅▃
Majorgroupcode 1 1.00 5.21 3.00 1.00 2.00 5.00 8.00 1.00e+01 ▇▅▃▇▅
SectorCode 1 1.00 12.16 5.29 1.00 7.00 13.00 17.00 2.20e+01 ▃▇▅▇▆
SectorName 1 1.00 13.34 6.30 1.00 8.00 12.00 19.00 2.30e+01 ▂▃▇▂▆
NatRepemployees 0 1.00 1.00 0.59 0.29 0.66 0.79 1.09 6.74e+00 ▇▁▁▁▁
NatRepemployeesGraduate 0 1.00 1.00 0.33 0.34 0.76 1.00 1.12 3.89e+00 ▇▃▁▁▁

Data preparation

In this section, we clean the second dataset to make it ready for analysis.

Code
# Just use the new dataset
# remove unnecessary data objects, then read the new data (keep the colour palette)
rm(list = setdiff(ls(), "colours"))
data <- read_sav("../Data/JRF Outsourced Workers - Occupations and Sectors.sav")

Rename variables and drop unwanted columns

Code
# Change column names
data <- data %>%
  rename(
    ID = MIProRspId,
    Gender = D1_Gender_C,
    Age = D2_Age,
    Region = D4_Region_C,
    Employment_Status = D3_employment,
    Employment_Type = E2,
    Consent_1_Ethnicity = SCD_Opt_In_1, 
    Consent_2_TU = SCD_Opt_In_2,                
    Consent_3_Health = SCD_Opt_In_3,
    Has_Degree = D6_educ1,
    Has_Second_Job = E1,
    Who_Pays_You = E3,
    Job_Security = E5,
    Work_Circumstance_Agency = E6_1,
    Work_Circumstance_Contract = E6_2,
    Work_Circumstance_Seasonal = E6_3,                       
    Work_Circumstance_Trainee = E6_4,
    Work_Circumstance_Other = E6_5,                       
    Work_Circumstance_Other_TEXT = E6_5_other,
    Org_Size = E7A,
    Is_Supervisor = E7B,
    Job_Title_TEXT = E8A,
    Main_Activity_TEXT = E8B,
    Employer_Activity_TEXT = E9A,
    Employer_Activity_SIC = E9B, #need to check that these were SIC and translate to digits if so
    Biz_Type = E10A,
    Non_Private_Biz_Type = E10B,
    Non_Private_Biz_Type_TEXT = E10B_9_other,
    # Don't rename the indicator questions and outsourcing questions variables 
    # because it creates more work when running statistics (i.e., remembering 
    # and typing out these longer var names)
    # Paid_One_Work_Other = Q1_1,
    # Third_Party = Q1_2,
    # Employer_Is_Agency = Q1_3,
    # Instructed_By_Other = Q1_4,
    # Work_In_Other = Q1_5,
    # Diff_Uniform = Q1_6,
    # Short_Long_Employ = Q2,
    # Short_Long_Employ_TEXT = Q2_3_other,
    # I_Am_Outsourced = Q3v3a,
    # Outsourced_And_Agency = Q3v3b,
    # Might_Be_Outsourced_And_Agency = Q3v3c,
    # Not_Outsourced_And_Agency = Q3v3d,
    Disability = D7_Disability1,
    Disability_Impact = D8_Disability2,
    #some form of multiplication between INCOME_FREQ and INCOME_OPEN_1 is necessary to create equivalence to an annual salary
  )

# drop unwanted columns
data <- data %>%
  select(-c(starts_with(c("D6_", "D7_"))))

Allocate to groups

As of 17 February 2024, the definitions are:

  • Outsourced, defined as responding ‘I am sure I am outsourced’ or ‘I might be outsourced’, and responding ‘I do work on a long-term basis’.

  • Likely agency, defined as those responding ‘I am sure I am agency’ and ‘I do work on a long-term basis’, excluding those people who are already defined as being outsourced.

  • High indicators: defined as responding TRUE to 5 or 6 of the outsourcing indicators, as well as responding ‘I do work on a long-term basis’, excluding those people who are already defined as outsourced or likely agency.

Code
# calculate indicator data
# invert the 1 = yes / 2 = no response coding so that yes = 1 and no = 0 for summing
invert_response <- function(x){
  2 - x
}
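# e.g. invert_response(c(1, 2)) returns c(1, 0); after inversion, summing the
# six Q1 indicators counts how many are TRUE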

# Now for just indicators
indicator_data <- data %>% 
  select(ID, starts_with("Q1")) %>%
  reframe(across(starts_with("Q1"), ~invert_response(.x)), .by = ID) %>%
  rowwise() %>%
  mutate(
    sum_true = sum(Q1_1, Q1_2, Q1_3, Q1_4, Q1_5, Q1_6)
  ) %>% 
  select(-starts_with("Q1"))

# Check ID the same
paste("IDs match: ", sum(data$ID == indicator_data$ID) == nrow(data))
[1] "IDs match:  TRUE"
Code
# Merge into main data set
data <- left_join(data, indicator_data, by = "ID")

# remove redundant indicator data
rm(indicator_data)

#------------------------#
#### Assign to groups ####
#------------------------#
# As of 17 February, the definitions are:
# 
# -   **Outsourced**, defined as responding 'I am sure I am outsourced' or 'I might be outsourced', and responding 'I do work on a long-term basis'.
# -   **Likely agency**, defined as those responding 'I am sure I am agency' and 'I do work on a long-term basis', **excluding** those people who are already defined as being outsourced.
# -   **High indicators**: defined as responding TRUE to 5 or 6 of the outsourcing indicators, as well as responding 'I do work on a long-term basis', **excluding** those people who are already defined as outsourced or likely agency.

data <- data %>%
  mutate(
    # SURE outsourced or MIGHT BE outsourced + LONGTERM
    outsourced = ifelse((Q3v3a == 1 & Q2 == 1) | (Q3v3a == 2 & Q2 == 1), 1, 0),
    # NOT outsourced, SURE agency, and LONG-TERM
    likely_agency = ifelse(outsourced == 0 & Q2 == 1 & (Q3v3b == 1 | Q3v3c == 1 | Q3v3d == 1), 1, 0),
    likely_agency = ifelse(is.na(likely_agency), 0, likely_agency),
    # NOT outsourced, NOT likely agency, 5 or more indicators, & LONGTERM
    high_indicators = ifelse(outsourced == 0 & likely_agency == 0 & (Q2 == 1 & sum_true >= 5), 1, 0)
  )

# count the groupings
print("Totals: 1 = Outsourced, 2 = Likely agency, 3 = High indicators")
[1] "Totals: 1 = Outsourced, 2 = Likely agency, 3 = High indicators"
Code
lapply(list(data$outsourced,
            data$likely_agency,
            data$high_indicators), sum)
[[1]]
[1] 1123

[[2]]
[1] 269

[[3]]
[1] 291
Code
# Flatten these groupings into a single variable

data <- data %>%
  mutate(
    # outsourcing group is the subdivided 'type' of outsourcing,
    # i.e., straight outsourced, likely agency, high indicators
    outsourcing_group = factor(case_when(outsourced == 1 ~ 'Outsourced',
                                         likely_agency == 1 ~ 'Likely agency',
                                         high_indicators == 1 ~ 'High indicators',
                                         TRUE ~ 'Not outsourced'), 
                               levels = c("Not outsourced",
                                          "Outsourced",
                                          "Likely agency",
                                          "High indicators")
    ),
    # outsourcing status is the binary split: anyone who is in one of the three
    # outsourced groups is outsourced, everyone else is not outsourced
    outsourcing_status = factor(case_when(outsourced == 1 | likely_agency == 1 | high_indicators == 1 ~ 'Outsourced',
                                         TRUE ~ 'Not outsourced'),
                                levels = c('Not outsourced',
                                           'Outsourced')
    )
  )

Age

Code
# Age to numeric
# Remove 'under 16' level
data <- data %>%
  mutate(
    # Get labels
    Age = haven::as_factor(Age),
    # Drop under 16s as we're interested in adult labour market
    Age = droplevels(Age, exclude = "Under 16"),
    # There are 2 responses that are 'over 80'. Change this to 80 so we can have
    # a numeric variable for age.
    # Set Age to numeric
    Age = as.numeric(case_when(
                      Age == "Over 80" ~ "80",
                      TRUE ~ Age)
                     )
  )

Labels

Applying haven::as_factor() to the whole dataset is inappropriate because it makes data wrangling a lot more cumbersome; for example, writing out labels for conditioning becomes very long, and plots become very busy with many level names (e.g., for the occupation variables, ethnicity, etc.). However, some variables do need their labels extracted: they could ostensibly be numeric, and leaving them unlabelled risks confusion later on (e.g., age, org size). We therefore translate one set of variables to their labels in place, replacing the numeric codes, while for the rest we keep the codes as they are and create a labelled duplicate alongside the original.
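
To illustrate the distinction, here is a minimal sketch of how haven represents labelled values (the labels are hypothetical, not the survey's actual codings):

Code
# a labelled vector stores numeric codes; as_factor() swaps in the labels
x <- haven::labelled(c(1, 2, 1), labels = c(Male = 1, Female = 2))
as.numeric(x)       # 1 2 1  -- the underlying codes
haven::as_factor(x) # Male Female Male -- codes replaced by their labels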

1: Variables to translate to labels:

  • Age -> numeric
  • Gender
  • Region
  • Employment_Type
  • Has_Degree
  • TradeUnion
  • Has_Second_Job
  • Work_Circumstance_*
  • Org_Size
  • Is_Supervisor
  • Biz_Type
  • Disability
  • Disability_Impact
  • smg_code
  • major_code
  • SectorCode

2: Variables to keep as is and create a labelled duplicate:

  • Ethnicity
  • Employment_Status
  • Who_Pays_You
  • Job_Security
  • Employer_Activity_SIC
  • Non_Private_Biz_Type
  • Q1_*
  • Q2
  • Q3*
  • INCOME_CLOSED_*
  • BORNUK
  • unit_code
  • UnitOccupation
  • MajorsubgroupOccupation
  • Majorgroupcode
  • SectorName

3: All other variables are left as they are

Consider dropping:

  • INCOME_OPEN_2: Only 0s and NAs. Completion rate = 0.26
Code
glimpse(data)
Code
# Translate the variables in category 1 above, i.e. translate to labels
# Note age is omitted here because we've done something different with it earlier
variables_to_labels <- c("Gender",
                         "Region",
                         "Employment_Type",
                         "ed_bands",
                         "Has_Degree",
                         "TradeUnion",
                         "Has_Second_Job",
                         colnames(select(data, contains("Work_Circumstance"))),
                         "Org_Size",
                         "Is_Supervisor",
                         "Biz_Type",
                         "Disability",
                         "Disability_Impact",
                         "smg_code",
                         "major_code",
                         "SectorCode"
                         )


# test to see if this method works
# test <- data
# test[variables_to_labels] <- lapply(test[variables_to_labels], haven::as_factor)

# lapply (rather than sapply) guarantees each column comes back as a factor;
# sapply can simplify the result to a matrix and strip the factor class
data[variables_to_labels] <- lapply(data[variables_to_labels], haven::as_factor)
Code
variables_to_duplicate <- c("Ethnicity",
                            "Employment_Status",
                            "Who_Pays_You",
                            "Job_Security",
                            "Employer_Activity_SIC",
                            "Non_Private_Biz_Type",
                            colnames(select(data, contains("Q1"))),
                            "Q2",
                            colnames(select(data, contains("Q3"))),
                            colnames(select(data, contains("INCOME_CLOSED"))),
                            "BORNUK",
                            "unit_code",
                            "UnitOccupation",
                            "MajorsubgroupOccupation",
                            "Majorgroupcode",
                            "SectorName")
                            
# Create labelled duplicates of the variables in category 2 above
duplicate_label <- function(data, variables){
  for(i in variables){
    new_var <- paste0(i, "_labelled") # append "_labelled" to the new var name
    data[new_var] <- haven::as_factor(data[i]) # new var is the labelled old var
    
    # place the new var after the old var for easy viewing
    data <- data %>%
      relocate(all_of(new_var), .after = all_of(i))
  }
  return(data)
}

# Test this works. It does
# test <- data
# test <- duplicate_label(test, variables_to_duplicate)


data <- duplicate_label(data, variables_to_duplicate)

Ethnicity

We have many ethnicity categories, which is great for disaggregation but challenging for visualising and determining general trends. Here we make a second ethnicity category that collapses ethnicity into fewer, broader categories. The super-categories are formed as follows:

White

  • English / Welsh / Scottish / Northern Irish / British,
  • Irish

White other

  • Gypsy or Irish Traveller
  • Roma
  • Any other White background

Black African

  • African
  • White and Black African

Black Caribbean

  • Caribbean
  • White and Black Caribbean

Black other

  • Any other Black, Black British, or Caribbean background

South Asian

  • Indian
  • Pakistani
  • Bangladeshi

East Asian

  • Chinese

Arab

  • Arab

Mixed other

  • White and Asian
  • Any other Mixed / Multiple ethnic background

Other

  • Any other ethnic group
  • Any other Asian background
  • Don’t think of myself as any of these
  • Prefer not to say
  • NA
Code
data <- data %>%   
  mutate(     
    Ethnicity_collapsed = forcats::fct_collapse(Ethnicity_labelled,
                                                "White" = c("English / Welsh / Scottish / Northern Irish / British",
                                                            "Irish"),
                                                "White other" = c("Gypsy or Irish Traveller",
                                                                  "Roma",
                                                                  "Any other White background"),
                                                "Black African" = c("African",
                                                                    "White and Black African"),
                                                "Black Caribbean" = c("Caribbean",
                                                                      "White and Black Caribbean"),
                                                "Black other" = c("Any other Black, Black British, or Caribbean background"),     
                                                "South Asian" = c("Indian",
                                                                  "Pakistani", 
                                                                  "Bangladeshi"),
                                                "East Asian" = c("Chinese"),
                                                "Arab" = "Arab",
                                                "Mixed other" = c("White and Asian",
                                                                  "Any other Mixed / Multiple ethnic background"),
                                                "Other" = c("Any other ethnic group",
                                                            "Any other Asian background",
                                                            "Don’t think of myself as any of these",
                                                            "Prefer not to say")
    ),
    Ethnicity_collapsed = replace_na(Ethnicity_collapsed, "Other"), # allocate NA responses to 'Other' group
    Ethnicity_collapsed = forcats::fct_relevel(Ethnicity_collapsed, "White") # make White the reference category
  ) %>%
  relocate(Ethnicity_collapsed, .after = Ethnicity_labelled) # put the new var after the old one

Income

Income is captured by an open question and a closed question. These are mutually exclusive: respondents report their income in either the open or the closed format, but not both:

Code
# Select just the income columns (and ID)
data_subset <- data %>%
  select(c(ID, contains("INCOME", ignore.case = F))) %>%
  select(!ends_with("labelled") & !ends_with("FREQ") & !ends_with("2"))

# Count how many NAs there are in each row. If any respondent answered both the
# OPEN and CLOSED questions, their row would contain fewer than 4 NAs.
na_count <- data_subset %>%
  mutate(NA_per_row = rowSums(is.na(.))) %>%
  select(ID, NA_per_row)

# No rows have less than 4 NAs
sum(na_count$NA_per_row < 4)
[1] 0
Code
# one row has five NAs - they didn't answer ANY income question
sum(na_count$NA_per_row > 4)
[1] 1
Code
# Take a look at this respondent to check
test_id <- na_count$ID[which(na_count$NA_per_row > 4)]
data_subset[which(data_subset$ID == test_id),]
# A tibble: 1 × 6
  ID                    INCOME_OPEN_1 INCOME_CLOSED_ANNUAL INCOME_CLOSED_MONTHLY
  <chr>                         <dbl> <dbl+lbl>            <dbl+lbl>            
1 b9ace7e0-2129-49a3-9…            NA NA                   NA                   
# ℹ 2 more variables: INCOME_CLOSED_WEEKLY <dbl+lbl>,
#   INCOME_CLOSED_HOURLY <dbl+lbl>
Code
number_open <- data_subset %>%
  filter(!is.na(INCOME_OPEN_1)) %>%
  nrow()

number_closed <- data_subset %>%
  filter(is.na(INCOME_OPEN_1)) %>%
  select(c(ID, contains("CLOSED"))) %>%
  nrow()

# but also check how many "prefer not to say"
closed_income <- data %>%
  filter(is.na(INCOME_OPEN_1)) %>%
  select(c(ID, contains("INCOME", ignore.case = F))) %>%
  select(ends_with("labelled"))

pref_not_say <- closed_income %>%
  rowwise() %>%
  summarise(
    sum = sum(c_across(everything()) == "Prefer not to say", na.rm=TRUE)
  ) %>%
  summarise(
    total_not_answered = sum(sum)
  )

closed_answered <- number_closed - pref_not_say$total_not_answered

7499 respondents answered using the open method. 1445 respondents answered using the closed method (a count that still includes the single respondent identified above who answered no income question at all). 1211 answered 'prefer not to say'.

For the open question, the participant chooses the period in which to report their income. To put everyone on a consistent footing, we create a single annualised income variable: monthly income is multiplied by 12, while weekly and hourly incomes are scaled by an estimate of the number of working weeks in a year. Rather than assuming a 52-week working year, we subtract the statutory minimum holiday entitlement (28 days, i.e. 5.6 weeks) from 365/7 ≈ 52.14 weeks, giving roughly 46.5 working weeks. This is still an approximation, and it is probably least accurate for those reporting weekly or hourly pay, whose employment may be more ad hoc than that of people reporting annual salaries.

Code
# work out min holiday entitlement and use this to calculate working weeks
weeks_in_year <- 365 / 7 # if we're being pedantic (this is 52.14 weeks)
min_holiday_entitlement <- 28
non_working_weeks <- min_holiday_entitlement/5
working_weeks <- weeks_in_year - non_working_weeks

data <- data %>%
  # get values of labels
  # mutate_all(haven::as_factor) %>%
  mutate(
    # convert everything to annual income; weekly and hourly responses are scaled by working_weeks (~46.5)
    income_annual = case_when(INCOME_FREQ == 1 ~ INCOME_OPEN_1, # annual
                              INCOME_FREQ == 2 ~ INCOME_OPEN_1*12, # monthly
                              INCOME_FREQ == 3 ~ INCOME_OPEN_1 * working_weeks, # weekly
                              INCOME_FREQ == 4 ~ INCOME_OPEN_1 * HOURS * working_weeks, # hourly
                              TRUE ~ NA),
    income_weekly = case_when(INCOME_FREQ == 1 ~ INCOME_OPEN_1 / working_weeks, # annual
                              INCOME_FREQ == 2 ~ (INCOME_OPEN_1 * 12) / working_weeks, # monthly
                              INCOME_FREQ == 3 ~ INCOME_OPEN_1, # weekly 
                              INCOME_FREQ == 4 ~ INCOME_OPEN_1 * HOURS, # hourly
                              TRUE ~ NA),
    income_hourly = income_weekly / HOURS,
    income_annual_percentile = ntile(income_annual, 100),
    income_weekly_percentile = ntile(income_weekly, 100),
    income_hourly_percentile = ntile(income_hourly, 100)
  )

options(scipen = 999)

# Check
# test <- data %>%
#   filter(!is.na(INCOME_OPEN_1)) %>%
#   select(c(INCOME_FREQ,INCOME_OPEN_1, income_annual, income_weekly, HOURS,income_hourly)) %>%
#   mutate(
#     INCOME_FREQ = haven::as_factor(INCOME_FREQ)
#   ) %>%
#   group_by(INCOME_FREQ) %>%
#   slice_head(n=2)

The range of salaries is massive, with extreme values at either end. This suggests that some respondents may have entered their salaries incorrectly.

Code
# (Considered: filtering to cases between the 5th and 95th income percentiles,
# i.e. dropping the top and bottom 5%. Outliers are instead handled iteratively below.)
income_statistics <- data %>%
  group_by(outsourcing_status) %>%
  summarise(
    mean = weighted.mean(income_annual, w = NatRepemployees, na.rm = T),
    median = wtd.quantile(income_annual, w = NatRepemployees, probs = c(.5), na.rm = T),
    min = wtd.quantile(income_annual, w = NatRepemployees, probs = c(0), na.rm = T),
    max = wtd.quantile(income_annual, w = NatRepemployees, probs = c(1), na.rm = T),
    stdev = sqrt(wtd.var(income_annual, w = NatRepemployees, na.rm = T))
  )

The distribution of income is highly skewed, with extremely high values creating a tail. It is likely that many extreme values are artefacts of the method of reporting. Below, we identify outliers as values 3 standard deviations above or below the mean. We do this iteratively, removing identified outliers and retesting, until no more outliers are identified.

Outliers are given a marker (income_drop == 1) to identify them. In analyses involving income, these outliers can be removed by only selecting cases where income_drop == 0.
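
For example, a minimal sketch of taking such an analysis subset (income_drop is created by the code below):

Code
# keep only cases whose income was not flagged as an outlier
income_analysis_data <- data %>%
  filter(income_drop == 0)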

Code
# removing outliers using weekly income (an equivalent run using annual income flagged the same number of cases)
x <- data

# calculate the mean and sd, and 3x sd
mean = Hmisc::wtd.mean(x$income_weekly, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_weekly,weights=x$NatRepemployees,na.rm = T))
Tmin = mean-(3*std)
Tmax = mean+(3*std)

# identify the first set of outliers
outliers <- which(x$income_weekly < Tmin | x$income_weekly > Tmax)
outlier_count <- length(outliers) # count how many removed this iteration
all_outlier_ids <- x$ID[outliers] # list of ids removed (to be removed from data later)

# initialise some variables, i.e. iteration 0
count <- 0 # iteration counter
total_cases_removed <- 0 # total cases removed

cat("Iteration ", count, ": ", outlier_count, " outliers\n")
Iteration  0 :  159  outliers
Code
# Take a look at the distribution
par(mfrow = c(1, 3)) # this lets us plot three plots in a row
hist(x$income_weekly, main = "Histogram") 
# mark the mean and sds for the histogram
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(x$income_weekly, main = "Boxplot")
qqnorm(x$income_weekly, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)

Code
# While there are still outliers detected, remove the outliers and recalculate 
# mean, sd, and 3*sd and remove the outliers based on these new figures.
while(length(outliers) != 0){
  count <- count + 1
  x <- x[-outliers,] # remove the outliers identified in the previous iteration
  
  # recalculate
  mean = Hmisc::wtd.mean(x$income_weekly, weights=x$NatRepemployees, na.rm=T)
  std = sqrt(Hmisc::wtd.var(x$income_weekly,weights=x$NatRepemployees,na.rm = T))
  Tmin = mean - (3 * std)
  Tmax = mean + (3 * std)
  outliers <- which(x$income_weekly < Tmin | x$income_weekly > Tmax)
  outlier_ids <- x$ID[outliers] # get outlier ids
  all_outlier_ids <- append(all_outlier_ids, outlier_ids) # add removed outliers to outlier list
  total_cases_removed <- total_cases_removed + outlier_count # count total
  
  outlier_count <- length(outliers) # count how many removed this iteration
  cat("Iteration ", count, ": ", outlier_count, " outliers\n")
  
  # Replot distributions to see how they've changed
  par(mfrow = c(1, 3))
  hist(x$income_weekly, main = "Histogram") 
  abline(v = mean, col='red', lwd = 3)
  abline(v = Tmin, col='blue', lwd = 3)
  abline(v = Tmax, col='blue', lwd = 3)
  boxplot(x$income_weekly, main = "Boxplot")
  qqnorm(x$income_weekly, main = "Normal Q-Q plot")
  mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
}
Iteration  1 :  148  outliers

Iteration  2 :  160  outliers

Iteration  3 :  94  outliers

Iteration  4 :  64  outliers

Iteration  5 :  18  outliers

Iteration  6 :  6  outliers

Iteration  7 :  1  outliers

Iteration  8 :  0  outliers

Code
cat(total_cases_removed, " cases removed in total, across ", count, " iterations")
650  cases removed in total, across  8  iterations
Code
# Flag the cases identified as outliers
data$income_drop[which(data$ID %in% all_outlier_ids)] <- 1
data$income_drop[is.na(data$income_drop)] <- 0 # make NAs 0

# Check this looks right
sum(data$income_drop, na.rm = T) # this should equal length(all_outlier_ids)
[1] 650
Code
# This should look the same as the last iteration above
test <- data %>%
  filter(income_drop == 0)

mean = Hmisc::wtd.mean(x$income_weekly, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_weekly,weights=x$NatRepemployees,na.rm = T))
Tmin = mean - (3 * std)
Tmax = mean + (3 * std)

par(mfrow = c(1, 3))
hist(test$income_weekly, main = "Histogram") 
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(test$income_weekly, main = "Boxplot")
qqnorm(test$income_weekly, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)

As shown above, the process ran for 8 iterations and in total flagged 650 observations to be excluded when examining income data (total non-NA income_annual N after removal = 6849). The resulting income distribution looks far more sensible.

Converting closed responses to continuous

The function below extracts the midpoint of each income bracket or, for the open-ended “less than” and “over” brackets, the stated boundary value.
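
As a worked illustration of the parsing logic, applied to two hypothetical bracket labels (the survey's actual wording may differ slightly):

Code
# sketch: parse bracket labels the way convert_to_numeric() does
demo <- tibble(bracket = c("£10,000 up to £19,999", "Less than £5,000")) %>%
  separate_wider_delim(bracket, names = c("min_income", "max_income"),
                       delim = " up to £", too_few = "align_start") %>%
  mutate(
    across(c(min_income, max_income),
           ~as.numeric(gsub(",", "", str_extract(.x, "\\d{1,3}(?:,\\d{3})*(?:\\.\\d+)?")))),
    midpoint = if_else(!is.na(max_income), (min_income + max_income) / 2, min_income)
  )
demo$midpoint # 14999.5 5000.0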

Code
closed_income <- data %>%
  filter(is.na(INCOME_OPEN_1)) %>%
  select(c(ID, contains("INCOME", ignore.case = F))) %>%
  select(ends_with("labelled"))

convert_to_numeric <- function(data) {
  # Select the closed income columns. We only want the labelled ones.
  matching_columns <- grep("CLOSED", colnames(data), ignore.case = FALSE)
  matching_columns <- matching_columns[grep("labelled", colnames(data)[matching_columns], ignore.case = FALSE)]
  matching_column_names <- colnames(data)[matching_columns]
  
  #column <- "INCOME_CLOSED_ANNUAL_labelled"
  # Iterate through the closed variables
  # For each one, split its range into two separate values and calculate the midpoint,
  # or for "less than" and "over", just take the value and use that
  for(column in matching_column_names){
    temp_df <- data %>%
      #mutate(copy = INCOME_CLOSED_ANNUAL_labelled) %>% # just for testing
      separate_wider_delim(column, names = c("min_income","max_income"), delim=" up to £", too_few = "align_start")
    
    temp_df <- temp_df %>%
      mutate(across(c(min_income, max_income), ~gsub("£","",.))) %>% # sub out the £ sign
      # This is for extracting the less than and over values as well subbing out the comma separators
      # regex = 1-3 digits, comma, three digits, decimal point any number of digits
      # In same operation, we also sub out the comma separator
      mutate(
        min_income = gsub(",","", str_extract(min_income, "\\d{1,3}(?:,\\d{3})*(?:\\.\\d+)?")) 
      ) %>%
      # sub out the comma separator again (this is just going to be for max_income, but leave in min_income anyway)
      mutate(across(c(min_income, max_income), ~gsub(",","",.))) %>%
      mutate(across(c(min_income, max_income), as.numeric)) %>% # note we lose 'prefer not to say' here
      # Where we have both min and max, calculate the midpoint, otherwise, just keep min 
      mutate(midpoint = case_when(!is.na(min_income) & !is.na(max_income) ~ (min_income + max_income) / 2,
                                TRUE ~ min_income)
      )

    # Add the new variable into the data
    data <- data %>%
      mutate(
        midpoint = temp_df$midpoint, .after = column
      )
    # make a sensible colname 
    colnames(data)[colnames(data) == "midpoint"] <- paste0(column, "_midpoint")
    }
  
  return(data)
}

data <- convert_to_numeric(data)

Now we calculate annual income from the closed responses in the same way as we did for the open questions.

Code
data <- data %>%
  # get values of labels
  # mutate_all(haven::as_factor) %>%
  mutate(
    # convert everything to annual income; weekly and hourly responses are scaled by working_weeks (~46.5)
    income_annual_closed = case_when(INCOME_FREQ == 1 ~ INCOME_CLOSED_ANNUAL_labelled_midpoint,
                              INCOME_FREQ == 2 ~ INCOME_CLOSED_MONTHLY_labelled_midpoint*12,
                              INCOME_FREQ == 3 ~ INCOME_CLOSED_WEEKLY_labelled_midpoint*working_weeks,
                              INCOME_FREQ == 4 ~ INCOME_CLOSED_HOURLY_labelled_midpoint*HOURS*working_weeks,
                              TRUE ~ NA),
    income_weekly_closed = case_when(INCOME_FREQ == 1 ~ INCOME_CLOSED_ANNUAL_labelled_midpoint / working_weeks,
                              INCOME_FREQ == 2 ~ (INCOME_CLOSED_MONTHLY_labelled_midpoint*12) / working_weeks,
                              INCOME_FREQ == 3 ~ INCOME_CLOSED_WEEKLY_labelled_midpoint,
                              INCOME_FREQ == 4 ~ INCOME_CLOSED_HOURLY_labelled_midpoint*HOURS,
                              TRUE ~ NA),
    income_hourly_closed = income_weekly_closed / HOURS,
    # percentiles are computed on the closed-income variables
    income_percentile_closed = ntile(income_annual_closed, 100),
    income_weekly_closed_percentile = ntile(income_weekly_closed, 100),
    income_hourly_closed_percentile = ntile(income_hourly_closed, 100)
  )

Next, combine the open and closed into a single variable.

Code
data <- data %>%
  mutate(
    income_annual_all = case_when(!is.na(income_annual) ~ income_annual,
                                   !is.na(income_annual_closed) ~ income_annual_closed,
                                   TRUE ~ NA),
    income_weekly_all = case_when(!is.na(income_weekly) ~ income_weekly,
                                   !is.na(income_weekly_closed) ~ income_weekly_closed,
                                   TRUE ~ NA),
    income_hourly_all = case_when(!is.na(income_hourly) ~ income_hourly,
                                   !is.na(income_hourly_closed) ~ income_hourly_closed,
                                   TRUE ~ NA)
    
  )

# Double check this leads to what we expect
# Yes, this gives the number of non-NA values we expect:
# 1212 NAs in total (1211 'prefer not to say' in the closed questions plus
# 1 respondent who answered no income question at all)
sum(!is.na(data$income_annual_all))
[1] 8943
Code
sum(is.na(data$income_annual_all))
[1] 1212
Code
sum(!is.na(data$income_weekly_all))
[1] 8943
Code
sum(is.na(data$income_weekly_all))
[1] 1212
Code
sum(!is.na(data$income_hourly_all))
[1] 8943
Code
sum(is.na(data$income_hourly_all))
[1] 1212

Apply the same outlier-removal process as above to the combined income variables (income_annual_all and income_weekly_all).

Code
# annual income all - same result as weekly
x <- data

# calculate the mean and sd, and 3x sd
mean = Hmisc::wtd.mean(x$income_annual_all, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_annual_all,weights=x$NatRepemployees,na.rm = T))
Tmin = mean-(3*std)
Tmax = mean+(3*std)

# identify the first set of outliers
outliers <- which(x$income_annual_all < Tmin | x$income_annual_all > Tmax)
outlier_count <- length(outliers) # count how many removed this iteration
all_outlier_ids <- x$ID[outliers] # list of ids removed (to be removed from data later)

# initialise some variables, i.e. iteration 0
count <- 0 # iteration counter
total_cases_removed <- 0 # total cases removed

cat("Iteration ", count, ": ", outlier_count, " outliers\n")
Iteration  0 :  173  outliers
Code
# Take a look at the distribution
par(mfrow = c(1, 3)) # this lets us plot three plots in a row
hist(x$income_annual_all, main = "Histogram") 
# mark the mean and sds for the histogram
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(x$income_annual_all, main = "Boxplot")
qqnorm(x$income_annual_all, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)

Code
# While there are still outliers detected, remove the outliers and recalculate 
# mean, sd, and 3*sd and remove the outliers based on these new figures.
while(length(outliers) != 0){
  count <- count + 1
  x <- x[-outliers,] # remove the outliers identified in the previous iteration
  
  # recalculate
  mean = Hmisc::wtd.mean(x$income_annual_all, weights=x$NatRepemployees, na.rm=T)
  std = sqrt(Hmisc::wtd.var(x$income_annual_all,weights=x$NatRepemployees,na.rm = T))
  Tmin = mean - (3 * std)
  Tmax = mean + (3 * std)
  outliers <- which(x$income_annual_all < Tmin | x$income_annual_all > Tmax)
  outlier_ids <- x$ID[outliers] # get outlier ids
  all_outlier_ids <- append(all_outlier_ids, outlier_ids) # add removed outliers to outlier list
  total_cases_removed <- total_cases_removed + outlier_count # count total
  
  outlier_count <- length(outliers) # count how many removed this iteration
  cat("Iteration ", count, ": ", outlier_count, " outliers\n")
  
  # Replot distributions to see how they've changed
  par(mfrow = c(1, 3))
  hist(x$income_annual_all, main = "Histogram") 
  abline(v = mean, col='red', lwd = 3)
  abline(v = Tmin, col='blue', lwd = 3)
  abline(v = Tmax, col='blue', lwd = 3)
  boxplot(x$income_annual_all, main = "Boxplot")
  qqnorm(x$income_annual_all, main = "Normal Q-Q plot")
  mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
}
Iteration  1 :  155  outliers

Iteration  2 :  180  outliers

Iteration  3 :  88  outliers

Iteration  4 :  46  outliers

Iteration  5 :  9  outliers

Iteration  6 :  1  outliers

Iteration  7 :  0  outliers

Code
cat(total_cases_removed, " cases removed in total, across ", count, " iterations")
652  cases removed in total, across  7  iterations
Code
# Flag the cases identified as outliers
data$income_drop_all[which(data$ID %in% all_outlier_ids)] <- 1
data$income_drop_all[is.na(data$income_drop_all)] <- 0 # make NAs 0

# Check this looks right
sum(data$income_drop_all, na.rm = T) # this should equal length(all_outlier_ids)
[1] 652
Code
# This should look the same as the last iteration above
test <- data %>%
  filter(income_drop_all == 0)

mean = Hmisc::wtd.mean(x$income_annual_all, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_annual_all,weights=x$NatRepemployees,na.rm = T))
Tmin = mean - (3 * std)
Tmax = mean + (3 * std)

par(mfrow = c(1, 3))
hist(test$income_annual_all, main = "Histogram") 
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(test$income_annual_all, main = "Boxplot")
qqnorm(test$income_annual_all, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)

Code
# income weekly all
x <- data

# calculate the mean and sd, and 3x sd
mean = Hmisc::wtd.mean(x$income_weekly_all, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_weekly_all,weights=x$NatRepemployees,na.rm = T))
Tmin = mean-(3*std)
Tmax = mean+(3*std)

# identify the first set of outliers
outliers <- which(x$income_weekly_all < Tmin | x$income_weekly_all > Tmax)
outlier_count <- length(outliers) # count how many removed this iteration
all_outlier_ids <- x$ID[outliers] # list of ids removed (to be removed from data later)

# initialise some variables, i.e. iteration 0
count <- 0 # iteration counter
total_cases_removed <- 0 # total cases removed

cat("Iteration ", count, ": ", outlier_count, " outliers\n")
Iteration  0 :  173  outliers
Code
# Take a look at the distribution
par(mfrow = c(1, 3)) # this lets us plot three plots in a row
hist(x$income_weekly_all, main = "Histogram") 
# mark the mean and sds for the histogram
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(x$income_weekly_all, main = "Boxplot")
qqnorm(x$income_weekly_all, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)

Code
# While there are still outliers detected, remove the outliers and recalculate 
# mean, sd, and 3*sd and remove the outliers based on these new figures.
while(length(outliers) != 0){
  count <- count + 1
  x <- x[-outliers,] # remove the outliers identified in the previous iteration
  
  # recalculate
  mean = Hmisc::wtd.mean(x$income_weekly_all, weights=x$NatRepemployees, na.rm=T)
  std = sqrt(Hmisc::wtd.var(x$income_weekly_all,weights=x$NatRepemployees,na.rm = T))
  Tmin = mean - (3 * std)
  Tmax = mean + (3 * std)
  outliers <- which(x$income_weekly_all < Tmin | x$income_weekly_all > Tmax)
  outlier_ids <- x$ID[outliers] # get outlier ids
  all_outlier_ids <- append(all_outlier_ids, outlier_ids) # add removed outliers to outlier list
  total_cases_removed <- total_cases_removed + outlier_count # count total
  
  outlier_count <- length(outliers) # count how many removed this iteration
  cat("Iteration ", count, ": ", outlier_count, " outliers\n")
  
  # Replot distributions to see how they've changed
  par(mfrow = c(1, 3))
  hist(x$income_weekly_all, main = "Histogram") 
  abline(v = mean, col='red', lwd = 3)
  abline(v = Tmin, col='blue', lwd = 3)
  abline(v = Tmax, col='blue', lwd = 3)
  boxplot(x$income_weekly_all, main = "Boxplot")
  qqnorm(x$income_weekly_all, main = "Normal Q-Q plot")
  mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
}
Iteration  1 :  155  outliers

Iteration  2 :  180  outliers

Iteration  3 :  88  outliers

Iteration  4 :  46  outliers

Iteration  5 :  9  outliers

Iteration  6 :  1  outliers

Iteration  7 :  0  outliers

Code
cat(total_cases_removed, " cases removed in total, across ", count, " iterations")
652  cases removed in total, across  7  iterations
Code
# Flag the cases identified as outliers (note: income_drop_all already exists
# from the annual run above and is not reset here; this relies on both runs
# flagging the same cases, which the identical iteration counts suggest)
data$income_drop_all[which(data$ID %in% all_outlier_ids)] <- 1
data$income_drop_all[is.na(data$income_drop_all)] <- 0 # make NAs 0

# Check this looks right
sum(data$income_drop_all, na.rm = T) # this should equal length(all_outlier_ids)
[1] 652
Code
# This should look the same as the last iteration above
test <- data %>%
  filter(income_drop_all == 0)

mean = Hmisc::wtd.mean(x$income_weekly_all, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_weekly_all,weights=x$NatRepemployees,na.rm = T))
Tmin = mean - (3 * std)
Tmax = mean + (3 * std)

par(mfrow = c(1, 3))
hist(test$income_weekly_all, main = "Histogram") 
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(test$income_weekly_all, main = "Boxplot")
qqnorm(test$income_weekly_all, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)

Applying income brackets - revisions on 30/09/2024

After further discussion and consultation with JRF analysts, who felt the 80% threshold was high, we have reverted to using two-thirds of ASHE regional median weekly pay as the threshold. This is also consistent with what JRF have done in the past and with what the OECD uses (see https://data.oecd.org/earnwage/wage-levels.htm).
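
Once the regional thresholds are joined onto the data (see the code below), classifying low pay is a single comparison. A minimal sketch, using the Region_two_thirds and income_weekly_all variables constructed in this document (the flag name low_paid is ours):

Code
# flag workers whose weekly pay falls below two-thirds of their region's median
data <- data %>%
  mutate(low_paid = income_weekly_all < Region_two_thirds)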

Code
# get the ashe data
# from https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/earningsandhoursworkedukregionbyagegroup
url <- 'https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/earningsandhoursworkedukregionbyagegroup/2023provisional/ashetableregionbyage2023provisional.zip'

filename <- basename(url) # this takes the filename from the url
filepath <- paste0("../data/", filename)
output_dir <- substr(filepath,start = 1, stop=nchar(filepath) - 4)

# check if the file exists. if it doesn't, download and unzip
if(!file.exists(filepath)){ 
  cat("Downloading data\n")
  download.file(url, destfile = filepath, mode = "wb")
  unzip(filepath,exdir=output_dir)
} else{
  cat("Data already in directory. Loading it.\n")
}
Data already in directory. Loading it.
Code
# Use list.files to find the gross weekly pay workbook in the extracted directory
files <- list.files(path = output_dir, pattern = 'Weekly pay - Gross 2023\\.xls$', full.names = TRUE)

ashe_data <- readxl::read_excel(files[1], sheet = 'All', skip = 4) %>%
  filter(!is.na(Code)) %>%
  #select(-c(last_col(offset=2):last_col(), contains('change'))) %>% # if we want some other variables
  janitor::clean_names() %>%
  rename(
    #jobs_thousands = thousand,
    Region = description,
    Region_median_income = median
  ) %>%
  select(Region, Region_median_income) %>% # if we just want the median
  # rename problematic regions
  mutate(
    Region = case_when(Region == "East" ~ "East of England",
                       Region == "Yorkshire and The Humber" ~ "Yorkshire and the Humber",
                       TRUE ~ Region),
    Region_median_income = as.numeric(Region_median_income)
    )
Code
# join to data
data <- data %>%
  left_join(., ashe_data, by = "Region")

# calculate the 4/5 and 2/3 thresholds
data <- data %>%
  mutate(
    Region_four_fifths =  Region_median_income * .8,
    Region_two_thirds = Region_median_income * (2/3),
    UK_median = ashe_data %>% filter(Region == "United Kingdom") %>% pull(Region_median_income),
    UK_four_fifths = UK_median * 0.8
  )


# stats by region
income_statistics_regional <- data %>%
  filter(income_drop == 0 & !is.na(income_weekly_all)) %>%
  group_by(Region) %>%
  summarise(
    mean = weighted.mean(income_weekly_all, w = NatRepemployees, na.rm = T),
    median = wtd.quantile(income_weekly_all, w = NatRepemployees, probs = c(.5), na.rm = T),
    min = wtd.quantile(income_weekly_all, w = NatRepemployees, probs = c(0), na.rm = T),
    max = wtd.quantile(income_weekly_all, w = NatRepemployees, probs = c(1), na.rm = T),
    stdev = sqrt(wtd.var(income_weekly_all, w = NatRepemployees, na.rm = T)),
    n = n()
  )

par(mar = c(2, 2, 2, 2))
# plot the distribution of income 
# blue lines are regional values, red lines are UK values
# lighter lines are the actual medians, darker lines are 4/5 medians
# data %>%
#   filter(income_drop == 0 & !is.na(income_weekly_all)) %>%
#   ggplot(., aes(x="",y=income_weekly_all)) + 
#   facet_wrap(~Region) + 
#   geom_violin() +
#   geom_boxplot(width = 0.3) +
#   geom_hline(aes(yintercept = UK_median), colour = "red", alpha=0.25) +
#   geom_hline(aes(yintercept = UK_four_fifths), colour = "red") +
#   geom_hline(aes(yintercept = Region_median_income), colour = "blue", alpha = 0.25) +
#   geom_hline(aes(yintercept = Region_four_fifths), colour = "blue") +
#   theme_minimal()

# let's take a look at two-thirds too 
# here blue lines are 2/3, red lines are 4/5
data %>%
  filter(income_drop == 0 & !is.na(income_weekly_all)) %>%
  ggplot(., aes(x="",y=income_weekly_all)) + 
  facet_wrap(~Region) + 
  geom_violin() +
  geom_boxplot(width = 0.3) +
  geom_hline(aes(yintercept = Region_four_fifths), colour = "red") +
  geom_hline(aes(yintercept = Region_two_thirds), colour = "blue") +
  theme_minimal()

Code
# use 2/3 because that's what the OECD use and what JRF have used in the past
# https://data.oecd.org/earnwage/wage-levels.htm

# create the grouping variable based on Region_two_thirds
# low pay is less than or equal to 2/3 of regional median earnings
data <- data %>%
  mutate(
    income_group = case_when(income_weekly_all <= Region_two_thirds  ~ "Low",
                             income_weekly_all > Region_two_thirds ~ "Not low",
                             TRUE ~ NA),
    income_group = factor(income_group, levels = c("Not low","Low"), exclude=NA)
  ) %>%
  ungroup()


# visualise it to check
# 26.77% of people are low paid
data %>%
  filter(income_drop == 0 & !is.na(income_weekly_all)) %>%
  dplyr::group_by(income_group) %>%
  dplyr::summarise(
    n = sum(NatRepemployees)
  ) %>%
  mutate(
    perc = 100 * (n / sum(n))
  ) %>%
  ggplot(aes("", perc, fill=income_group)) +
  geom_col() +
  theme_minimal() +
  #theme(plot.margin = unit(c(0,4,0,4), "inches")) +
  geom_label(aes(label=paste0(round(perc,2),"%")))

Code
## comparing outsourced to not outsourced
## 25.88% of non-outsourced are low paid, 31.2% of outsourced are low paid
data %>%
  filter(income_drop == 0 & !is.na(income_weekly_all)) %>%
  dplyr::group_by(outsourcing_status, income_group) %>%
  dplyr::summarise(
    n = sum(NatRepemployees)
  ) %>%
  mutate(
    perc = 100 * (n / sum(n))
  ) %>%
  ggplot(aes(outsourcing_status, perc, fill=income_group)) +
  geom_col() +
  theme_minimal() +
  #theme(plot.margin = unit(c(0,4,0,4), "inches")) +
  geom_label(aes(label=paste0(round(perc,2),"%")))

Code
# one last check to see that people have been categorised correctly
# test <- data %>%
#   filter(income_drop==0) %>% # why do we still get nas for annual income?
#   select(Annual_Income, Region_four_fifths, income_group) %>%
#   mutate(
#     test = case_when(Annual_Income < Region_four_fifths ~ "Low",
#                      Annual_Income >= Region_four_fifths ~ "Not low",
#                      TRUE ~ NA)
#   )
# # check that they're the same - value should be 0
# sum(test$income_group != test$test, na.rm=T)
Code
# test significance
mod <- glm(income_group ~ outsourcing_status, data, family="binomial", weights = NatRepemployees)
summary(mod)

Call:
glm(formula = income_group ~ outsourcing_status, family = "binomial", 
    data = data, weights = NatRepemployees)

Coefficients:
                             Estimate Std. Error z value             Pr(>|z|)
(Intercept)                  -1.13900    0.02717 -41.918 < 0.0000000000000002
outsourcing_statusOutsourced  0.21801    0.06292   3.465              0.00053
                                
(Intercept)                  ***
outsourcing_statusOutsourced ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 10002.1  on 8942  degrees of freedom
Residual deviance:  9990.4  on 8941  degrees of freedom
  (1212 observations deleted due to missingness)
AIC: 10890

Number of Fisher Scoring iterations: 4
Code
test <- summary(mod)

or <- exp(mod[["coefficients"]][["outsourcing_statusOutsourced"]])
p <- test[["coefficients"]][2,4]

The difference is confirmed statistically. The odds of being in the low income group are 1.24 times higher for an outsourced worker than for a non-outsourced worker (p < .001). In the full analysis we can control for other factors when testing this relationship (e.g., age, gender, etc.).
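
As a quick check on that interpretation, the odds ratio and the implied group probabilities can be read straight off the fitted model (a minimal sketch using mod from above):

Code
# Odds ratio for outsourced vs non-outsourced (same value as `or` above)
exp(coef(mod)[["outsourcing_statusOutsourced"]])

# Implied probability of being low paid in each group, via the inverse logit
plogis(coef(mod)[["(Intercept)"]]) # not outsourced
plogis(sum(coef(mod)))             # outsourced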

Improving N for income

When reporting income, respondents could choose whether to report income as an open text response or by selecting a bracket. All income-based work in this report so far has used only the open text responses. We may be interested in combining data from the CLOSED questions with the OPEN ones, although there is no clear method for doing so. Possible options are:

  1. Make all numeric. This would involve taking the midpoint of the bracket for CLOSED responses and using that as a numeric value. I think this is very questionable because it will distort the distribution and effectively make it meaningless.
  2. Make all brackets. This would involve categorising the numeric income data into the same brackets that are present in the CLOSED method (a sketch follows this list). This would maximise our coverage, as all incomes can be sensibly harmonised, but it would mean we don't have a distribution to work with, which limits how we apply thresholds to responses and the resolution with which we can look at the income data.
  3. Continue to keep them separate. This involves doing what we can separately with the two types of reporting method. The advantage is that we maximise coverage. The disadvantages are that it will double the work and muddy the picture, as we have to consider income in two different ways.
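
As a sketch of option 2, the open numeric weekly incomes could be cut into bands with cut(). The breakpoints below are hypothetical placeholders, not the actual bracket boundaries from the CLOSED weekly question:

Code
# Hypothetical bracket boundaries, for illustration only; the real ones
# would be read off the CLOSED weekly income question in the survey
breaks <- c(0, 110, 220, 330, 440, 550, Inf)

data %>%
  mutate(
    income_weekly_bracket = cut(income_weekly_all, breaks = breaks,
                                include.lowest = TRUE)
  ) %>%
  count(income_weekly_bracket)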
Code
income_subset <- data %>%
  select(INCOME_FREQ, contains("INCOME_OPEN"), income_annual, contains("INCOME_CLOSED_")) %>%
  mutate(
    INCOME_FREQ = haven::as_factor(INCOME_FREQ)
  )

# Check whether any of the closed income variables have non-missing values on the same rows as the open income (derived). There are none.
vars <- colnames(select(income_subset, contains("INCOME_CLOSED")))
for(i in vars){
  cat(paste0(i,": ",sum(!is.na(income_subset$income_annual) & !is.na(income_subset[i]))), "\n")
}
INCOME_CLOSED_ANNUAL: 0 
INCOME_CLOSED_ANNUAL_labelled: 0 
INCOME_CLOSED_ANNUAL_labelled_midpoint: 0 
INCOME_CLOSED_MONTHLY: 0 
INCOME_CLOSED_MONTHLY_labelled: 0 
INCOME_CLOSED_MONTHLY_labelled_midpoint: 0 
INCOME_CLOSED_WEEKLY: 0 
INCOME_CLOSED_WEEKLY_labelled: 0 
INCOME_CLOSED_WEEKLY_labelled_midpoint: 0 
INCOME_CLOSED_HOURLY: 0 
INCOME_CLOSED_HOURLY_labelled: 0 
INCOME_CLOSED_HOURLY_labelled_midpoint: 0 
Code
# Skim indicates completion of closed weekly/hourly is really low, but monthly/annual is a bit better. We might want to try and combine somehow. Think about code to extract the numbers from the brackets, then take the midpoint of the two numbers: extract from after '£' to the following space, into two separate elements to start
skim(income_subset)
Data summary
Name income_subset
Number of rows 10155
Number of columns 16
_______________________
Column type frequency:
factor 5
numeric 11
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
INCOME_FREQ 0 1.00 FALSE 4 Ann: 4515, Mon: 4215, Wee: 764, Hou: 661
INCOME_CLOSED_ANNUAL_labelled 9315 0.08 FALSE 12 Pre: 380, £22: 74, £28: 60, £16: 58
INCOME_CLOSED_MONTHLY_labelled 8748 0.14 FALSE 12 Pre: 635, £1,: 157, £94: 125, £1,: 120
INCOME_CLOSED_WEEKLY_labelled 9884 0.03 FALSE 12 Pre: 118, £33: 32, £11: 29, £22: 24
INCOME_CLOSED_HOURLY_labelled 10018 0.01 FALSE 11 Pre: 78, £10: 31, £12: 9, £14: 6

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
INCOME_OPEN_1 2656 0.74 21132.05 29878.94 6.00 1516.5 13500.0 33000.0 750000 ▇▁▁▁▁
INCOME_OPEN_2 7500 0.26 0.00 0.00 0.00 0.0 0.0 0.0 0 ▁▁▇▁▁
income_annual 2656 0.74 46884.73 104977.77 1440.00 19200.0 27588.0 40000.0 2559857 ▇▁▁▁▁
INCOME_CLOSED_ANNUAL 9315 0.08 8.96 3.38 1.00 6.0 11.0 12.0 12 ▁▂▂▂▇
INCOME_CLOSED_ANNUAL_labelled_midpoint 9695 0.05 33070.85 14140.52 5600.00 25200.5 30800.5 47600.5 56000 ▃▇▃▆▇
INCOME_CLOSED_MONTHLY 8748 0.14 7.86 4.12 1.00 4.0 8.0 12.0 12 ▃▃▂▁▇
INCOME_CLOSED_MONTHLY_labelled_midpoint 9383 0.08 1873.08 1043.83 470.00 1175.5 1645.5 2585.5 4700 ▇▇▂▂▂
INCOME_CLOSED_WEEKLY 9884 0.03 7.49 4.29 1.00 4.0 7.0 12.0 12 ▅▃▂▁▇
INCOME_CLOSED_WEEKLY_labelled_midpoint 10002 0.02 390.74 228.44 110.00 165.0 385.5 495.5 1100 ▇▆▂▂▁
INCOME_CLOSED_HOURLY 10018 0.01 9.11 4.73 1.00 3.0 13.0 13.0 13 ▃▂▁▁▇
INCOME_CLOSED_HOURLY_labelled_midpoint 10096 0.01 13.13 4.60 8.91 11.0 11.0 13.0 30 ▇▂▁▁▁
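
One way to implement the midpoint idea in the comment above is to pull the '£' amounts out of each bracket label and average them. A minimal sketch, assuming labels of the form "£x to £y" (the exact label format in the data may differ):

Code
# Extract the monetary amounts from a bracket label and average them;
# readr::parse_number() strips the '£' and any comma separators
bracket_midpoint <- function(label){
  amounts <- stringr::str_extract_all(label, "£[0-9,]+")[[1]]
  mean(readr::parse_number(amounts))
}

bracket_midpoint("£220 to £330") # 275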

Feedback from 8 May was to incorporate the closed responses with the open responses. This has been done in the section above, 'Converting closed responses to continuous'.

Sector

Create a variable for shortened sector names to make them nicer to read.

Code
data <- data %>%
  mutate(
    SectorName_short = SectorName_labelled
  ) %>%
  # make the sector names more readable
  separate_wider_delim(SectorName_short, names = c("SectorName_short", "SectorName_short_detail"), delim=";",
                       too_few = "align_start") %>%
  mutate(
    SectorName_short = factor(stringr::str_to_sentence(SectorName_short)),
    SectorName_short_detail = factor(stringr::str_to_sentence(SectorName_short_detail))
  )

Region

Make region a factor and make the reference London.

Code
data <- data %>%
  mutate(
    Region = forcats::fct_relevel(factor(Region), "London")
  )

Some final cleaning that was originally at the beginning of the analysis script

Code
# Make our disaggregated ethnicity groups identical to the original census groups
# as per request in JRF - Outsourced workers - To dos 

data$Ethnicity_collapsed_disaggregated <- data$Ethnicity_labelled

data <- data %>%
  mutate(
    Ethnicity_collapsed = case_when(
      # Grouping White ethnicities
      Ethnicity %in% c(1) ~ "White British", # just White British as the reference
      Ethnicity %in% c(2, 3, 4, 5) ~ "White other", # Gypsy, Irish Traveller, Roma, Irish and White other grouped together
      # Grouping Asian ethnicities
      Ethnicity %in% c(10,11,12,13,14) ~ "Asian/Asian British",
      # Grouping Black ethnicities
      Ethnicity %in% c(15,16,17) ~ "Black/African/Caribbean/Black British",
      # Grouping Mixed ethnicities
      Ethnicity %in% c(6,7,8,9) ~ "Mixed/Multiple ethnic group",
      # Grouping Other ethnicities
      Ethnicity %in% c(18) ~ "Arab/British Arab",
      # Handling missing or ambiguous categories
      Ethnicity %in% c(19)  ~ "Other ethnic group",
      #prefer not to say
      Ethnicity %in% c(20,21) ~ "Prefer not to say",
      # Default case for any unmatched entries
      TRUE ~ "Prefer not to say"
    )
  )

# make White British the reference category
data$Ethnicity_collapsed <- relevel(factor(data$Ethnicity_collapsed), ref = "White British")

data <- data %>%
  mutate(
    Has_Degree = factor(Has_Degree, levels = c("No", "Yes", "Don't know"))
  )

# make binary born uk var
categories <- as.vector(unique(data$BORNUK_labelled))
non_categories <- categories[!(categories %in% "I was born in the UK")]

# Will throw an NA warning. I think this is OK; see the note at the end of this section for a possible fix
data <- data %>%
  mutate(
    BORNUK_binary = forcats::fct_collapse(BORNUK_labelled,
                                      "Born in UK" = "I was born in the UK",
                                      "Not born in UK" = non_categories)
  ) 

# make binary ethnicity var
ethnicities <- as.vector(unique(data$Ethnicity_collapsed))
non_white_ethnicities <- ethnicities[!(ethnicities %in% "White British")]

# Will throw an NA warning, as above
data <- data %>%
  mutate(
    Ethnicity_binary = forcats::fct_collapse(Ethnicity_collapsed,
                                      "White British" = c("White British"),
                                      "Non-White British" = non_white_ethnicities)
  )
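
On the NA warnings flagged above: they most likely arise because unique() returns NA as one of the categories, which then reaches fct_collapse() as a level name. A possible fix, untested against this data, is to drop NAs before building the category vectors:

Code
# Drop NA from the category list before collapsing, so fct_collapse()
# never sees NA as a level name
categories <- unique(na.omit(as.character(data$BORNUK_labelled)))
non_categories <- setdiff(categories, "I was born in the UK")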

Save

Code
check <- readline("Save data? (y/n)")
Save data? (y/n)
Code
if(check=="y"){
  saveRDS(data, file = paste0("../Data/", format(Sys.Date(), "%Y-%m-%d"), " - Cleaned_Data.rds"))
# haven::write_sav(as.data.frame(data), path = paste0("../Data/", format(Sys.Date(), "%Y-%m-%d"), " - Cleaned_Data.sav"))
  write_csv(data, file = paste0("../Data/", format(Sys.Date(), "%Y-%m-%d"), " - Cleaned_Data.csv"))
  print("File saved")
} else{
  print("File not saved")
}
[1] "File not saved"