rm(list = ls())
# Load libraries
library(gridExtra)
library(skimr)
library(haven)
library(tidyverse)
library(wesanderson)
library(Hmisc)
colours <- wes_palette("GrandBudapest2", 4, "discrete")
We received two datasets on two separate occasions from Opinium. The second contains occupation and sector data that the first didn’t include. The first thing we need to do is understand the differences between these two datasets to determine whether we can simply choose one dataset, and which one this should be.
The second dataset is an updated version of the first. Although there are some variables that are present in one dataset but not in the other, investigating these differences indicates that there is no problem with simply using the second dataset instead of the first. Read on for details on the steps taken to determine this.
# Read the first data we received from Opinium
data <- read_sav("../Data/UK23626 Workers sample data with nat rep and graduates weight.sav")
# View(data)

# Read the second data we received from Opinium. This includes more job/sector information
data_2 <- read_sav("../Data/JRF Outsourced Workers - Occupations and Sectors.sav")
# skim(data_2)
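Base R's setdiff() gives a quick first look at the column differences; the check_columns() helper defined below does the same job but also reports counts:

# quick sketch of the comparison, equivalent in spirit to check_columns() below
setdiff(colnames(data), colnames(data_2))
setdiff(colnames(data_2), colnames(data))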
# Investigate the differences between the two datasets
check_columns <- function(data1, data2){
  variables_absent <- c()
  total <- 0
  cols1 <- colnames(data1)
  cols2 <- colnames(data2)

  # For each item in cols1, check whether it exists in cols2
  for(x in seq_along(cols1)){
    this_var = cols1[x]
    present = this_var %in% cols2
    total = total + present
    missing = length(cols1) - total
    # Create vector of all missing vars
    if(!present){
      variables_absent = append(variables_absent, this_var)
    }
  }
  cat(paste0(missing, " columns in ", deparse(substitute(data1)), " missing from ", deparse(substitute(data2)), ":"))
  cat(paste0("\n", variables_absent))
  cat("\n")
  return(variables_absent)
}

vars_absent1 <- check_columns(data, data_2)
11 columns in data missing from data_2:
StartTime
Agegender_recode_useforquota
Q3v1_1
Q3v1_2
Q3v1_3
Q3v1_4
Q3v1_5
Q3v2_1
Q3v2_2
Q3v4a
Q3v4b
vars_absent2 <- check_columns(data_2, data)
9 columns in data_2 missing from data:
ed_bands
unit_code
UnitOccupation
smg_code
MajorsubgroupOccupation
major_code
Majorgroupcode
SectorCode
SectorName
Looking at the variables in data that are missing from data_2, all but two are completely missing (complete_rate = 0), and the remaining two (StartTime and Agegender_recode_useforquota) are not needed. This exercise indicates that we can simply switch to using data_2 instead of data.
vars_absent1_subset <- dplyr::select(data, dplyr::all_of(vars_absent1))

skim(vars_absent1_subset)
Name | vars_absent1_subset |
Number of rows | 10155 |
Number of columns | 11 |
_______________________ | |
Column type frequency: | |
numeric | 11 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
StartTime | 0 | 1 | 152128.49 | 49331.93 | 4 | 125642 | 160423 | 185037.5 | 235940 | ▁▂▅▇▅ |
Agegender_recode_useforquota | 0 | 1 | 5.54 | 2.72 | 1 | 3 | 6 | 8.0 | 11 | ▇▅▅▇▁ |
Q3v1_1 | 10155 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v1_2 | 10155 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v1_3 | 10155 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v1_4 | 10155 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v1_5 | 10155 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v2_1 | 10155 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v2_2 | 10155 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v4a | 10155 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v4b | 10155 | 0 | NaN | NA | NA | NA | NA | NA | NA |
Visual inspection of a skim of each dataset indicates that the variables common to both are identical.
skim(data)
Name | data |
Number of rows | 10155 |
Number of columns | 88 |
_______________________ | |
Column type frequency: | |
character | 7 |
numeric | 81 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
MIProRspId | 0 | 1 | 36 | 36 | 0 | 10155 | 0 |
E6_5_other | 0 | 1 | 0 | 102 | 10142 | 14 | 0 |
E8A | 0 | 1 | 2 | 124 | 0 | 4946 | 0 |
E8B | 0 | 1 | 2 | 274 | 0 | 6559 | 0 |
E9A | 0 | 1 | 2 | 185 | 0 | 4369 | 0 |
E10B_9_other | 0 | 1 | 0 | 75 | 9992 | 132 | 0 |
Q2_3_other | 0 | 1 | 0 | 93 | 9988 | 140 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
StartTime | 0 | 1.00 | 152128.49 | 49331.93 | 4.00 | 125642.00 | 160423.00 | 185037.50 | 235940.00 | ▁▂▅▇▅ |
D1_Gender_C | 0 | 1.00 | 1.53 | 0.54 | 1.00 | 1.00 | 2.00 | 2.00 | 4.00 | ▇▇▁▁▁ |
D2_Age | 0 | 1.00 | 42.15 | 13.03 | 16.00 | 32.00 | 42.00 | 53.00 | 81.00 | ▅▇▇▅▁ |
Agegender_recode_useforquota | 0 | 1.00 | 5.54 | 2.72 | 1.00 | 3.00 | 6.00 | 8.00 | 11.00 | ▇▅▅▇▁ |
D4_Region_C | 0 | 1.00 | 7.25 | 3.39 | 1.00 | 4.00 | 7.00 | 10.00 | 12.00 | ▃▅▃▃▇ |
D3_employment | 0 | 1.00 | 1.24 | 0.45 | 1.00 | 1.00 | 1.00 | 1.00 | 3.00 | ▇▁▂▁▁ |
E2 | 0 | 1.00 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▇▁▁ |
SCD_Opt_In_1 | 0 | 1.00 | 1.03 | 0.18 | 1.00 | 1.00 | 1.00 | 1.00 | 2.00 | ▇▁▁▁▁ |
SCD_Opt_In_2 | 0 | 1.00 | 1.07 | 0.26 | 1.00 | 1.00 | 1.00 | 1.00 | 2.00 | ▇▁▁▁▁ |
SCD_Opt_In_3 | 0 | 1.00 | 1.04 | 0.20 | 1.00 | 1.00 | 1.00 | 1.00 | 2.00 | ▇▁▁▁▁ |
Ethnicity | 343 | 0.97 | 2.89 | 4.26 | 1.00 | 1.00 | 1.00 | 1.00 | 21.00 | ▇▁▁▁▁ |
D6_educ1 | 0 | 1.00 | 1.47 | 0.52 | 1.00 | 1.00 | 1.00 | 2.00 | 3.00 | ▇▁▆▁▁ |
D7_educ2_1 | 5527 | 0.46 | 0.56 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
D7_educ2_2 | 5527 | 0.46 | 0.26 | 0.44 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
D7_educ2_3 | 5527 | 0.46 | 0.21 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
D7_educ2_4 | 5527 | 0.46 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D7_educ2_5 | 5527 | 0.46 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D7_educ2_6 | 5527 | 0.46 | 0.02 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D7_educ2_7 | 5527 | 0.46 | 0.22 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
D7_educ2_8 | 5527 | 0.46 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
D7_educ2_9 | 5527 | 0.46 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D7_educ2_10 | 5527 | 0.46 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D7_educ2_11 | 5527 | 0.46 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D7_educ2_12 | 5527 | 0.46 | 0.05 | 0.22 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D7_educ2_13 | 5527 | 0.46 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
TradeUnion | 737 | 0.93 | 1.71 | 0.48 | 1.00 | 1.00 | 2.00 | 2.00 | 3.00 | ▃▁▇▁▁ |
D6_Children_1 | 0 | 1.00 | 0.02 | 0.14 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D6_Children_2 | 0 | 1.00 | 0.13 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D6_Children_3 | 0 | 1.00 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D6_Children_4 | 0 | 1.00 | 0.13 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D6_Children_5 | 0 | 1.00 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D6_Children_6 | 0 | 1.00 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D6_Children_7 | 0 | 1.00 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
D6_Children_8 | 0 | 1.00 | 0.26 | 0.44 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
D6_Children_9 | 0 | 1.00 | 0.39 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
E1 | 0 | 1.00 | 1.88 | 0.33 | 1.00 | 2.00 | 2.00 | 2.00 | 2.00 | ▁▁▁▁▇ |
E3 | 0 | 1.00 | 1.07 | 0.44 | 1.00 | 1.00 | 1.00 | 1.00 | 5.00 | ▇▁▁▁▁ |
E5 | 0 | 1.00 | 1.11 | 0.39 | 1.00 | 1.00 | 1.00 | 1.00 | 3.00 | ▇▁▁▁▁ |
E6_1 | 9658 | 0.05 | 0.31 | 0.46 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
E6_2 | 9658 | 0.05 | 0.60 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
E6_3 | 9658 | 0.05 | 0.11 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
E6_4 | 9658 | 0.05 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
E6_5 | 9658 | 0.05 | 0.03 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
E7A | 1 | 1.00 | 6.16 | 2.70 | 1.00 | 5.00 | 6.00 | 9.00 | 9.00 | ▃▂▂▆▇ |
E7B | 0 | 1.00 | 1.61 | 0.49 | 1.00 | 1.00 | 2.00 | 2.00 | 2.00 | ▅▁▁▁▇ |
E9B | 0 | 1.00 | 12.67 | 5.62 | 1.00 | 8.00 | 14.00 | 17.00 | 22.00 | ▂▅▃▇▃ |
E10A | 0 | 1.00 | 1.33 | 0.47 | 1.00 | 1.00 | 1.00 | 2.00 | 2.00 | ▇▁▁▁▃ |
E10B | 6827 | 0.33 | 4.55 | 2.03 | 1.00 | 3.00 | 4.00 | 6.00 | 9.00 | ▂▇▂▆▁ |
Q1_1 | 0 | 1.00 | 1.84 | 0.37 | 1.00 | 2.00 | 2.00 | 2.00 | 2.00 | ▂▁▁▁▇ |
Q1_2 | 0 | 1.00 | 1.81 | 0.39 | 1.00 | 2.00 | 2.00 | 2.00 | 2.00 | ▂▁▁▁▇ |
Q1_3 | 0 | 1.00 | 1.67 | 0.47 | 1.00 | 1.00 | 2.00 | 2.00 | 2.00 | ▃▁▁▁▇ |
Q1_4 | 0 | 1.00 | 1.79 | 0.41 | 1.00 | 2.00 | 2.00 | 2.00 | 2.00 | ▂▁▁▁▇ |
Q1_5 | 0 | 1.00 | 1.80 | 0.40 | 1.00 | 2.00 | 2.00 | 2.00 | 2.00 | ▂▁▁▁▇ |
Q1_6 | 0 | 1.00 | 1.71 | 0.46 | 1.00 | 1.00 | 2.00 | 2.00 | 2.00 | ▃▁▁▁▇ |
Q2 | 0 | 1.00 | 1.13 | 0.38 | 1.00 | 1.00 | 1.00 | 1.00 | 3.00 | ▇▁▁▁▁ |
Q3v3a | 0 | 1.00 | 2.78 | 0.57 | 1.00 | 3.00 | 3.00 | 3.00 | 3.00 | ▁▁▁▁▇ |
Q3v3b | 9383 | 0.08 | 2.48 | 0.82 | 1.00 | 2.00 | 3.00 | 3.00 | 3.00 | ▂▁▁▁▇ |
Q3v3c | 9435 | 0.07 | 2.35 | 0.77 | 1.00 | 2.00 | 3.00 | 3.00 | 3.00 | ▃▁▅▁▇ |
Q3v3d | 1492 | 0.85 | 2.90 | 0.41 | 1.00 | 3.00 | 3.00 | 3.00 | 3.00 | ▁▁▁▁▇ |
D7_Disability1 | 443 | 0.96 | 1.82 | 0.47 | 1.00 | 2.00 | 2.00 | 2.00 | 3.00 | ▂▁▇▁▁ |
D8_Disability2 | 8052 | 0.21 | 2.13 | 0.67 | 1.00 | 2.00 | 2.00 | 3.00 | 4.00 | ▂▇▁▃▁ |
INCOME_FREQ | 0 | 1.00 | 1.76 | 0.85 | 1.00 | 1.00 | 2.00 | 2.00 | 4.00 | ▇▇▁▂▁ |
INCOME_OPEN_1 | 2656 | 0.74 | 21132.05 | 29878.94 | 6.00 | 1516.50 | 13500.00 | 33000.00 | 750000.00 | ▇▁▁▁▁ |
INCOME_OPEN_2 | 7500 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
INCOME_CLOSED_ANNUAL | 9315 | 0.08 | 8.96 | 3.38 | 1.00 | 6.00 | 11.00 | 12.00 | 12.00 | ▁▂▂▂▇ |
INCOME_CLOSED_MONTHLY | 8748 | 0.14 | 7.86 | 4.12 | 1.00 | 4.00 | 8.00 | 12.00 | 12.00 | ▃▃▂▁▇ |
INCOME_CLOSED_WEEKLY | 9884 | 0.03 | 7.49 | 4.29 | 1.00 | 4.00 | 7.00 | 12.00 | 12.00 | ▅▃▂▁▇ |
INCOME_CLOSED_HOURLY | 10018 | 0.01 | 9.11 | 4.73 | 1.00 | 3.00 | 13.00 | 13.00 | 13.00 | ▃▂▁▁▇ |
HOURS | 0 | 1.00 | 34.40 | 9.18 | 4.00 | 30.00 | 37.50 | 40.00 | 84.00 | ▂▃▇▁▁ |
BORNUK | 0 | 1.00 | 1.73 | 1.97 | 1.00 | 1.00 | 1.00 | 1.00 | 10.00 | ▇▁▁▁▁ |
Q3v1_1 | 10155 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v1_2 | 10155 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v1_3 | 10155 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v1_4 | 10155 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v1_5 | 10155 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v2_1 | 10155 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v2_2 | 10155 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v4a | 10155 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
Q3v4b | 10155 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
NatRepemployees | 0 | 1.00 | 1.00 | 0.59 | 0.29 | 0.66 | 0.79 | 1.09 | 6.74 | ▇▁▁▁▁ |
NatRepemployeesGraduate | 0 | 1.00 | 1.00 | 0.33 | 0.34 | 0.76 | 1.00 | 1.12 | 3.89 | ▇▃▁▁▁ |
skim(data_2)
Name | data_2 |
Number of rows | 10155 |
Number of columns | 86 |
_______________________ | |
Column type frequency: | |
character | 7 |
numeric | 79 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
MIProRspId | 0 | 1 | 36 | 36 | 0 | 10155 | 0 |
E6_5_other | 0 | 1 | 0 | 102 | 10142 | 14 | 0 |
E8A | 0 | 1 | 2 | 124 | 0 | 4946 | 0 |
E8B | 0 | 1 | 2 | 274 | 0 | 6559 | 0 |
E9A | 0 | 1 | 2 | 185 | 0 | 4369 | 0 |
E10B_9_other | 0 | 1 | 0 | 75 | 9992 | 132 | 0 |
Q2_3_other | 0 | 1 | 0 | 93 | 9988 | 140 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
D1_Gender_C | 0 | 1.00 | 1.53 | 0.54 | 1.00 | 1.00 | 2.00 | 2.00 | 4.00e+00 | ▇▇▁▁▁ |
D2_Age | 0 | 1.00 | 42.15 | 13.03 | 16.00 | 32.00 | 42.00 | 53.00 | 8.10e+01 | ▅▇▇▅▁ |
D4_Region_C | 0 | 1.00 | 7.25 | 3.39 | 1.00 | 4.00 | 7.00 | 10.00 | 1.20e+01 | ▃▅▃▃▇ |
D3_employment | 0 | 1.00 | 1.24 | 0.45 | 1.00 | 1.00 | 1.00 | 1.00 | 3.00e+00 | ▇▁▂▁▁ |
E2 | 0 | 1.00 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00e+00 | ▁▁▇▁▁ |
SCD_Opt_In_1 | 0 | 1.00 | 1.03 | 0.18 | 1.00 | 1.00 | 1.00 | 1.00 | 2.00e+00 | ▇▁▁▁▁ |
SCD_Opt_In_2 | 0 | 1.00 | 1.07 | 0.26 | 1.00 | 1.00 | 1.00 | 1.00 | 2.00e+00 | ▇▁▁▁▁ |
SCD_Opt_In_3 | 0 | 1.00 | 1.04 | 0.20 | 1.00 | 1.00 | 1.00 | 1.00 | 2.00e+00 | ▇▁▁▁▁ |
Ethnicity | 343 | 0.97 | 2.89 | 4.26 | 1.00 | 1.00 | 1.00 | 1.00 | 2.10e+01 | ▇▁▁▁▁ |
D6_educ1 | 0 | 1.00 | 1.47 | 0.52 | 1.00 | 1.00 | 1.00 | 2.00 | 3.00e+00 | ▇▁▆▁▁ |
D7_educ2_1 | 5527 | 0.46 | 0.56 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00e+00 | ▆▁▁▁▇ |
D7_educ2_2 | 5527 | 0.46 | 0.26 | 0.44 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00e+00 | ▇▁▁▁▃ |
D7_educ2_3 | 5527 | 0.46 | 0.21 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▂ |
D7_educ2_4 | 5527 | 0.46 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D7_educ2_5 | 5527 | 0.46 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D7_educ2_6 | 5527 | 0.46 | 0.02 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D7_educ2_7 | 5527 | 0.46 | 0.22 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▂ |
D7_educ2_8 | 5527 | 0.46 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▂ |
D7_educ2_9 | 5527 | 0.46 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D7_educ2_10 | 5527 | 0.46 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D7_educ2_11 | 5527 | 0.46 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D7_educ2_12 | 5527 | 0.46 | 0.05 | 0.22 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D7_educ2_13 | 5527 | 0.46 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
ed_bands | 0 | 1.00 | 2.43 | 0.68 | 1.00 | 2.00 | 3.00 | 3.00 | 3.00e+00 | ▂▁▅▁▇ |
TradeUnion | 737 | 0.93 | 1.71 | 0.48 | 1.00 | 1.00 | 2.00 | 2.00 | 3.00e+00 | ▃▁▇▁▁ |
D6_Children_1 | 0 | 1.00 | 0.02 | 0.14 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D6_Children_2 | 0 | 1.00 | 0.13 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D6_Children_3 | 0 | 1.00 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D6_Children_4 | 0 | 1.00 | 0.13 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D6_Children_5 | 0 | 1.00 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D6_Children_6 | 0 | 1.00 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D6_Children_7 | 0 | 1.00 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
D6_Children_8 | 0 | 1.00 | 0.26 | 0.44 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00e+00 | ▇▁▁▁▃ |
D6_Children_9 | 0 | 1.00 | 0.39 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00e+00 | ▇▁▁▁▅ |
E1 | 0 | 1.00 | 1.88 | 0.33 | 1.00 | 2.00 | 2.00 | 2.00 | 2.00e+00 | ▁▁▁▁▇ |
E3 | 0 | 1.00 | 1.07 | 0.44 | 1.00 | 1.00 | 1.00 | 1.00 | 5.00e+00 | ▇▁▁▁▁ |
E5 | 0 | 1.00 | 1.11 | 0.39 | 1.00 | 1.00 | 1.00 | 1.00 | 3.00e+00 | ▇▁▁▁▁ |
E6_1 | 9658 | 0.05 | 0.31 | 0.46 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00e+00 | ▇▁▁▁▃ |
E6_2 | 9658 | 0.05 | 0.60 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00e+00 | ▆▁▁▁▇ |
E6_3 | 9658 | 0.05 | 0.11 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
E6_4 | 9658 | 0.05 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
E6_5 | 9658 | 0.05 | 0.03 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00e+00 | ▇▁▁▁▁ |
E7A | 1 | 1.00 | 6.16 | 2.70 | 1.00 | 5.00 | 6.00 | 9.00 | 9.00e+00 | ▃▂▂▆▇ |
E7B | 0 | 1.00 | 1.61 | 0.49 | 1.00 | 1.00 | 2.00 | 2.00 | 2.00e+00 | ▅▁▁▁▇ |
E9B | 0 | 1.00 | 12.67 | 5.62 | 1.00 | 8.00 | 14.00 | 17.00 | 2.20e+01 | ▂▅▃▇▃ |
E10A | 0 | 1.00 | 1.33 | 0.47 | 1.00 | 1.00 | 1.00 | 2.00 | 2.00e+00 | ▇▁▁▁▃ |
E10B | 6827 | 0.33 | 4.55 | 2.03 | 1.00 | 3.00 | 4.00 | 6.00 | 9.00e+00 | ▂▇▂▆▁ |
Q1_1 | 0 | 1.00 | 1.84 | 0.37 | 1.00 | 2.00 | 2.00 | 2.00 | 2.00e+00 | ▂▁▁▁▇ |
Q1_2 | 0 | 1.00 | 1.81 | 0.39 | 1.00 | 2.00 | 2.00 | 2.00 | 2.00e+00 | ▂▁▁▁▇ |
Q1_3 | 0 | 1.00 | 1.67 | 0.47 | 1.00 | 1.00 | 2.00 | 2.00 | 2.00e+00 | ▃▁▁▁▇ |
Q1_4 | 0 | 1.00 | 1.79 | 0.41 | 1.00 | 2.00 | 2.00 | 2.00 | 2.00e+00 | ▂▁▁▁▇ |
Q1_5 | 0 | 1.00 | 1.80 | 0.40 | 1.00 | 2.00 | 2.00 | 2.00 | 2.00e+00 | ▂▁▁▁▇ |
Q1_6 | 0 | 1.00 | 1.71 | 0.46 | 1.00 | 1.00 | 2.00 | 2.00 | 2.00e+00 | ▃▁▁▁▇ |
Q2 | 0 | 1.00 | 1.13 | 0.38 | 1.00 | 1.00 | 1.00 | 1.00 | 3.00e+00 | ▇▁▁▁▁ |
Q3v3a | 0 | 1.00 | 2.78 | 0.57 | 1.00 | 3.00 | 3.00 | 3.00 | 3.00e+00 | ▁▁▁▁▇ |
Q3v3b | 9383 | 0.08 | 2.48 | 0.82 | 1.00 | 2.00 | 3.00 | 3.00 | 3.00e+00 | ▂▁▁▁▇ |
Q3v3c | 9435 | 0.07 | 2.35 | 0.77 | 1.00 | 2.00 | 3.00 | 3.00 | 3.00e+00 | ▃▁▅▁▇ |
Q3v3d | 1492 | 0.85 | 2.90 | 0.41 | 1.00 | 3.00 | 3.00 | 3.00 | 3.00e+00 | ▁▁▁▁▇ |
D7_Disability1 | 443 | 0.96 | 1.82 | 0.47 | 1.00 | 2.00 | 2.00 | 2.00 | 3.00e+00 | ▂▁▇▁▁ |
D8_Disability2 | 8052 | 0.21 | 2.13 | 0.67 | 1.00 | 2.00 | 2.00 | 3.00 | 4.00e+00 | ▂▇▁▃▁ |
INCOME_FREQ | 0 | 1.00 | 1.76 | 0.85 | 1.00 | 1.00 | 2.00 | 2.00 | 4.00e+00 | ▇▇▁▂▁ |
INCOME_OPEN_1 | 2656 | 0.74 | 21132.05 | 29878.94 | 6.00 | 1516.50 | 13500.00 | 33000.00 | 7.50e+05 | ▇▁▁▁▁ |
INCOME_OPEN_2 | 7500 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00e+00 | ▁▁▇▁▁ |
INCOME_CLOSED_ANNUAL | 9315 | 0.08 | 8.96 | 3.38 | 1.00 | 6.00 | 11.00 | 12.00 | 1.20e+01 | ▁▂▂▂▇ |
INCOME_CLOSED_MONTHLY | 8748 | 0.14 | 7.86 | 4.12 | 1.00 | 4.00 | 8.00 | 12.00 | 1.20e+01 | ▃▃▂▁▇ |
INCOME_CLOSED_WEEKLY | 9884 | 0.03 | 7.49 | 4.29 | 1.00 | 4.00 | 7.00 | 12.00 | 1.20e+01 | ▅▃▂▁▇ |
INCOME_CLOSED_HOURLY | 10018 | 0.01 | 9.11 | 4.73 | 1.00 | 3.00 | 13.00 | 13.00 | 1.30e+01 | ▃▂▁▁▇ |
HOURS | 0 | 1.00 | 34.40 | 9.18 | 4.00 | 30.00 | 37.50 | 40.00 | 8.40e+01 | ▂▃▇▁▁ |
BORNUK | 0 | 1.00 | 1.73 | 1.97 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00e+01 | ▇▁▁▁▁ |
unit_code | 1 | 1.00 | 446.62 | 251.04 | 0.00 | 231.00 | 412.00 | 623.00 | 9.99e+02 | ▃▇▅▅▃ |
UnitOccupation | 1 | 1.00 | 52.72 | 29.69 | 1.00 | 30.00 | 49.00 | 83.00 | 1.04e+02 | ▆▇▅▇▇ |
smg_code | 1 | 1.00 | 44.37 | 25.12 | 0.00 | 23.00 | 41.00 | 62.00 | 9.90e+01 | ▃▇▅▅▃ |
MajorsubgroupOccupation | 1 | 1.00 | 11.06 | 8.31 | 1.00 | 3.00 | 9.00 | 18.00 | 2.70e+01 | ▇▂▂▃▂ |
major_code | 1 | 1.00 | 4.24 | 2.53 | 0.00 | 2.00 | 4.00 | 6.00 | 9.00e+00 | ▃▇▅▅▃ |
Majorgroupcode | 1 | 1.00 | 5.21 | 3.00 | 1.00 | 2.00 | 5.00 | 8.00 | 1.00e+01 | ▇▅▃▇▅ |
SectorCode | 1 | 1.00 | 12.16 | 5.29 | 1.00 | 7.00 | 13.00 | 17.00 | 2.20e+01 | ▃▇▅▇▆ |
SectorName | 1 | 1.00 | 13.34 | 6.30 | 1.00 | 8.00 | 12.00 | 19.00 | 2.30e+01 | ▂▃▇▂▆ |
NatRepemployees | 0 | 1.00 | 1.00 | 0.59 | 0.29 | 0.66 | 0.79 | 1.09 | 6.74e+00 | ▇▁▁▁▁ |
NatRepemployeesGraduate | 0 | 1.00 | 1.00 | 0.33 | 0.34 | 0.76 | 1.00 | 1.12 | 3.89e+00 | ▇▃▁▁▁ |
In this section, we clean the second dataset to make it ready for analysis.
# Just use the new dataset
# remove unnecessary data objects, then read new data (keep colour palette)
rm(list = setdiff(ls(), "colours"))
data <- read_sav("../Data/JRF Outsourced Workers - Occupations and Sectors.sav")
# Change column names
data <- data %>%
  rename(
ID = MIProRspId,
Gender = D1_Gender_C,
Age = D2_Age,
Region = D4_Region_C,
Employment_Status = D3_employment,
Employment_Type = E2,
Consent_1_Ethnicity = SCD_Opt_In_1,
Consent_2_TU = SCD_Opt_In_2,
Consent_3_Health = SCD_Opt_In_3,
Has_Degree = D6_educ1,
Has_Second_Job = E1,
Who_Pays_You = E3,
Job_Security = E5,
Work_Circumstance_Agency = E6_1,
Work_Circumstance_Contract = E6_2,
Work_Circumstance_Seasonal = E6_3,
Work_Circumstance_Trainee = E6_4,
Work_Circumstance_Other = E6_5,
Work_Circumstance_Other_TEXT = E6_5_other,
Org_Size = E7A,
Is_Supervisor = E7B,
Job_Title_TEXT = E8A,
Main_Activity_TEXT = E8B,
Employer_Activity_TEXT = E9A,
Employer_Activity_SIC = E9B, #need to check that these were SIC and translate to digits if so
Biz_Type = E10A,
Non_Private_Biz_Type = E10B,
Non_Private_Biz_Type_TEXT = E10B_9_other,
# Don't rename the indicator questions and outsourcing questions variables
# because it creates more work when running statistics (i.e., remembering
# and typing out these longer var names)
# Paid_One_Work_Other = Q1_1,
# Third_Party = Q1_2,
# Employer_Is_Agency = Q1_3,
# Instructed_By_Other = Q1_4,
# Work_In_Other = Q1_5,
# Diff_Uniform = Q1_6,
# Short_Long_Employ = Q2,
# Short_Long_Employ_TEXT = Q2_3_other,
# I_Am_Outsourced = Q3v3a,
# Outsourced_And_Agency = Q3v3b,
# Might_Be_Outsourced_And_Agency = Q3v3c,
# Not_Outsourced_And_Agency = Q3v3d,
Disability = D7_Disability1,
Disability_Impact = D8_Disability2,
#some form of multiplication between INCOME_FREQ and INCOME_OPEN_1 is necessary to create equivalence to an annual salary
)
# drop unwanted columns
data <- data %>%
  select(-c(starts_with(c("D6_", "D7_"))))
As of 17 February 2024, the definitions are:

- Outsourced, defined as responding 'I am sure I am outsourced' or 'I might be outsourced', and responding 'I do work on a long-term basis'.
- Likely agency, defined as those responding 'I am sure I am agency' and 'I do work on a long-term basis', excluding those people who are already defined as being outsourced.
- High indicators, defined as responding TRUE to 5 or 6 of the outsourcing indicators, as well as responding 'I do work on a long-term basis', excluding those people who are already defined as outsourced or likely agency.
# calculate indicator data
# invert response function for summing: maps 1 -> 1 and 2 -> 0
invert_response <- function(x){
  x <- 2 + (-1 * x)
  x
}
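A quick check of the inversion (assuming the 1/2 survey coding, where 1 marks an endorsed indicator):

invert_response(1) # 1
invert_response(2) # 0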
# Now for just indicators
indicator_data <- data %>%
  select(ID, starts_with("Q1")) %>%
  reframe(across(starts_with("Q1"), ~invert_response(.x)), .by = ID) %>%
  rowwise() %>%
  mutate(
    sum_true = sum(Q1_1, Q1_2, Q1_3, Q1_4, Q1_5, Q1_6)
  ) %>%
  select(-starts_with("Q1"))
# Check ID the same
paste("IDs match: ", sum(data$ID == indicator_data$ID) == nrow(data))
[1] "IDs match: TRUE"
# Merge into main data set
data <- left_join(data, indicator_data, by = "ID")
# remove redundant indicator data
rm(indicator_data)
#------------------------#
#### Assign to groups ####
#------------------------#
# As of 17 February, the definitions are:
#
# - **Outsourced**, defined as responding 'I am sure I am outsourced' or 'I might be outsourced', and responding 'I do work on a long-term basis'.
# - **Likely agency**, defined as those responding 'I am sure I am agency' and 'I do work on a long-term basis', **excluding** those people who are already defined as being outsourced.
# - **High indicators**: defined as responding TRUE to 5 or 6 of the outsourcing indicators, as well as responding 'I do work on a long-term basis', **excluding** those people who are already defined as outsourced or likely agency.
data <- data %>%
  mutate(
    # SURE outsourced or MIGHT BE outsourced + LONGTERM
    outsourced = ifelse((Q3v3a == 1 & Q2 == 1) | (Q3v3a == 2 & Q2 == 1), 1, 0),
    # NOT outsourced, SURE agency, and LONG-TERM
    likely_agency = ifelse(outsourced == 0 & Q2 == 1 & (Q3v3b == 1 | Q3v3c == 1 | Q3v3d == 1), 1, 0),
    likely_agency = ifelse(is.na(likely_agency), 0, likely_agency),
    # NOT outsourced, NOT likely agency, 5 or more indicators, & LONGTERM
    high_indicators = ifelse(outsourced == 0 & likely_agency == 0 & (Q2 == 1 & sum_true >= 5), 1, 0)
  )
# count the groupings
print("Totals: 1 = Outsourced, 2 = Likely agency, 3 = High indicators")
[1] "Totals: 1 = Outsourced, 2 = Likely agency, 3 = High indicators"
lapply(list(data$outsourced,
            data$likely_agency,
            data$high_indicators), sum)
[[1]]
[1] 1123
[[2]]
[1] 269
[[3]]
[1] 291
# Flatten these groupings into a single variable
data <- data %>%
  mutate(
    # outsourcing group is subdivided by 'type' of outsourcing,
    # i.e., straight outsourced, likely agency, high indicators
    outsourcing_group = factor(case_when(outsourced == 1 ~ 'Outsourced',
                                         likely_agency == 1 ~ 'Likely agency',
                                         high_indicators == 1 ~ 'High indicators',
                                         TRUE ~ 'Not outsourced'),
                               levels = c("Not outsourced",
                                          "Outsourced",
                                          "Likely agency",
                                          "High indicators")
    ),
    # outsourcing status is the binary split: anyone who is in one of the
    # outsourced groups is outsourced, everyone else is not outsourced
    outsourcing_status = factor(case_when(outsourced == 1 | likely_agency == 1 | high_indicators == 1 ~ 'Outsourced',
                                          TRUE ~ 'Not outsourced'),
                                levels = c('Not outsourced',
                                           'Outsourced')
    )
  )
# Age to numeric
# Remove 'under 16' level
data <- data %>%
  mutate(
    # Get labels
    Age = haven::as_factor(Age),
    # Drop under 16s as we're interested in the adult labour market
    Age = droplevels(Age, exclude = "Under 16"),
    # There are 2 responses of 'Over 80'. Change these to 80 so we can have
    # a numeric variable for age.
    Age = as.numeric(case_when(
      Age == "Over 80" ~ "80",
      TRUE ~ as.character(Age)
    ))
  )
Applying haven::as_factor() to the whole dataset is inappropriate because it makes data wrangling a lot more cumbersome; for example, writing out labels for conditioning becomes very long, and plots become very busy with many level names (e.g., for occupation variables, ethnicity, etc.). However, some variables need to have their labels extracted, because these variables could ostensibly be numeric, and leaving them unlabelled risks confusion later on (e.g., with age, org size). We therefore translate the numeric-type variables to their labels, replacing the old values, while for the factor-like variables we keep the unlabelled version alongside a labelled duplicate.
1. Variables to translate to labels
2. Variables to keep as-is and create a labelled duplicate
3. All other variables: leave as they are
Consider dropping:
glimpse(data)
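As a small illustration of the distinction between labelled values and their underlying codes (a toy labelled vector, not from the data):

# hypothetical labelled vector for illustration
toy <- haven::labelled(c(1, 2, 1), labels = c(Male = 1, Female = 2))
haven::as_factor(toy) # readable factor levels: Male, Female, Male
as.numeric(toy)       # the underlying codes: 1 2 1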
# Translate the variables in category 1 above, i.e. translate to labels
# Note age is omitted here because we've done something different with it earlier
<- c("Gender",
variables_to_labels "Region",
"Employment_Type",
"ed_bands",
"Has_Degree",
"TradeUnion",
"Has_Second_Job",
colnames(select(data, contains("Work_Circumstance"))),
"Org_Size",
"Is_Supervisor",
"Biz_Type",
"Disability",
"Disability_Impact",
"smg_code",
"major_code",
"SectorCode"
)
# test to see if this method works
# test <- data
#
# # This works:
# test[variables_to_labels] <- sapply(test[variables_to_labels], haven::as_factor)
# Check it works:
# test <- test %>%
# relocate(new_age, .after = Age) %>%
# relocate(new_gender, .after = Gender)
data[variables_to_labels] <- sapply(data[variables_to_labels], haven::as_factor)
<- c("Ethnicity",
variables_to_duplicate "Employment_Status",
"Who_Pays_You",
"Job_Security",
"Employer_Activity_SIC",
"Non_Private_Biz_Type",
colnames(select(data, contains("Q1"))),
"Q2",
colnames(select(data, contains("Q3"))),
colnames(select(data, contains("INCOME_CLOSED"))),
"BORNUK",
"unit_code",
"UnitOccupation",
"MajorsubgroupOccupation",
"Majorgroupcode",
"SectorName")
# Translate the variables in category 2 above, i.e., create labelled duplicates
duplicate_label <- function(data, variables){
  for(i in variables){
    #print(data[i])
    old_var <- data[i]
    new_var <- paste0(i, "_labelled") # append 'labelled' to the new var name
    data[new_var] <- haven::as_factor(old_var) # set new var as labelled old var

    # locate new var after old var for easy viewing
    data <- data %>%
      relocate(
        new_var,
        .after = colnames(old_var)
      )
  }
  return(data)
}

# Test this works. It does
# test <- data
# test <- duplicate_label(test, variables_to_duplicate)

data <- duplicate_label(data, variables_to_duplicate)
We have many ethnicity categories, which is great for disaggregation but challenging for visualising and determining general trends. Here we make a second ethnicity category that collapses ethnicity into fewer, broader categories. The super-categories are formed as follows:
White
White other
Black African
Black Caribbean
Black other
South Asian
East Asian
Arab
Mixed other
Other
data <- data %>%
  mutate(
    Ethnicity_collapsed = forcats::fct_collapse(Ethnicity_labelled,
      "White" = c("English / Welsh / Scottish / Northern Irish / British",
                  "Irish"),
      "White other" = c("Gypsy or Irish Traveller",
                        "Roma",
                        "Any other White background"),
      "Black African" = c("African",
                          "White and Black African"),
      "Black Caribbean" = c("Caribbean",
                            "White and Black Caribbean"),
      "Black other" = c("Any other Black, Black British, or Caribbean background"),
      "South Asian" = c("Indian",
                        "Pakistani",
                        "Bangladeshi"),
      "East Asian" = c("Chinese"),
      "Arab" = "Arab",
      "Mixed other" = c("White and Asian",
                        "Any other Mixed / Multiple ethnic background"),
      "Other" = c("Any other ethnic group",
                  "Any other Asian background",
                  "Don’t think of myself as any of these",
                  "Prefer not to say")
    ),
    Ethnicity_collapsed = replace_na(Ethnicity_collapsed, "Other"), # allocate NA responses to 'Other' group
    Ethnicity_collapsed = forcats::fct_relevel(Ethnicity_collapsed, "White") # make White the reference category
  ) %>%
  relocate(Ethnicity_collapsed, .after = Ethnicity_labelled) # put the new var after the old one
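To sanity-check the collapse, the counts of the new super-categories can be inspected (a quick look, not part of the cleaning itself):

# unweighted counts of the collapsed ethnicity levels
forcats::fct_count(data$Ethnicity_collapsed)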
Income has an open question and a closed question. They are mutually exclusive, i.e., respondents can report their income in either open or closed format, but not both:
# Select just the income columns (and ID)
data_subset <- data %>%
  select(c(ID, contains("INCOME", ignore.case = F))) %>%
  select(!ends_with("labelled") & !ends_with("FREQ") & !ends_with("2"))

# Count how many NAs there are in each row. If any respondent has responded to
# both OPEN and CLOSED questions, the number of NAs in the row should be less than 4.
na_count <- data_subset %>%
  summarise(
    ID = ID,
    NA_per_row = rowSums(is.na(.))
  )
# No rows have fewer than 4 NAs
sum(na_count$NA_per_row < 4)
[1] 0
# One row has five NAs - they didn't answer ANY income question
sum(na_count$NA_per_row > 4)
[1] 1
# Take a look at this respondent to check
test_id <- na_count$ID[which(na_count$NA_per_row > 4)]
data_subset[which(data_subset$ID == test_id),]
# A tibble: 1 × 6
ID INCOME_OPEN_1 INCOME_CLOSED_ANNUAL INCOME_CLOSED_MONTHLY
<chr> <dbl> <dbl+lbl> <dbl+lbl>
1 b9ace7e0-2129-49a3-9… NA NA NA
# ℹ 2 more variables: INCOME_CLOSED_WEEKLY <dbl+lbl>,
# INCOME_CLOSED_HOURLY <dbl+lbl>
number_open <- data_subset %>%
  filter(!is.na(INCOME_OPEN_1)) %>%
  nrow()

number_closed <- data_subset %>%
  filter(is.na(INCOME_OPEN_1)) %>%
  select(c(ID, contains("CLOSED"))) %>%
  nrow()

# but also check how many "prefer not to say"
closed_income <- data %>%
  filter(is.na(INCOME_OPEN_1)) %>%
  select(c(ID, contains("INCOME", ignore.case = F))) %>%
  select(ends_with("labelled"))

pref_not_say <- closed_income %>%
  rowwise() %>%
  summarise(
    sum = sum(c_across(everything()) == "Prefer not to say", na.rm = TRUE)
  ) %>%
  summarise(
    total_not_answered = sum(sum)
  )

closed_answered <- number_closed - pref_not_say$total_not_answered
7499 respondents answered using the open method. 1445 respondents answered using the closed method. 1211 did not answer either.
For the open question, the participant is asked how they want to report their income (annually, monthly, weekly, or hourly). To get a consistent income period for all participants, we create a single income variable computed from the reported period: monthly income is multiplied by 12, weekly income by the number of working weeks in a year, and hourly income by reported weekly hours and working weeks. Rather than assuming a full 52-week working year, we subtract the statutory minimum holiday entitlement (calculated below); even so, annualising probably introduces inaccuracies for people reporting on a weekly or hourly basis, whose employment may be more ad hoc than that of those reporting annual salaries.
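As a rough worked example with hypothetical figures: an hourly reporter on £12/hour working 40 hours per week annualises as follows, using the working-weeks calculation defined in the next chunk:

# hypothetical respondent: £12/hour, 40 hours/week
working_weeks_example <- 365 / 7 - 28 / 5 # ~46.5 working weeks after minimum holiday
12 * 40 * working_weeks_example           # ~ £22,341 annual equivalent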
# work out min holiday entitlement and use this to calculate working weeks
weeks_in_year <- 365 / 7 # if we're being pedantic (this is 52.14 weeks)
min_holiday_entitlement <- 28 # days
non_working_weeks <- min_holiday_entitlement / 5
working_weeks <- weeks_in_year - non_working_weeks
data <- data %>%
  # get values of labels
  # mutate_all(haven::as_factor) %>%
  mutate(
    # make all incomes annual, using the holiday-adjusted working_weeks from above
    income_annual = case_when(INCOME_FREQ == 1 ~ INCOME_OPEN_1, # annual
                              INCOME_FREQ == 2 ~ INCOME_OPEN_1 * 12, # monthly
                              INCOME_FREQ == 3 ~ INCOME_OPEN_1 * working_weeks, # weekly
                              INCOME_FREQ == 4 ~ INCOME_OPEN_1 * HOURS * working_weeks, # hourly
                              TRUE ~ NA),
    income_weekly = case_when(INCOME_FREQ == 1 ~ INCOME_OPEN_1 / working_weeks, # annual
                              INCOME_FREQ == 2 ~ (INCOME_OPEN_1 * 12) / working_weeks, # monthly
                              INCOME_FREQ == 3 ~ INCOME_OPEN_1, # weekly
                              INCOME_FREQ == 4 ~ INCOME_OPEN_1 * HOURS, # hourly
                              TRUE ~ NA),
    income_hourly = income_weekly / HOURS,
    income_annual_percentile = ntile(income_annual, 100),
    income_weekly_percentile = ntile(income_weekly, 100),
    income_hourly_percentile = ntile(income_hourly, 100)
  )
options(scipen = 999)
# Check
# test <- data %>%
# filter(!is.na(INCOME_OPEN_1)) %>%
# select(c(INCOME_FREQ,INCOME_OPEN_1, income_annual, income_weekly, HOURS,income_hourly)) %>%
# mutate(
# INCOME_FREQ = haven::as_factor(INCOME_FREQ)
# ) %>%
# group_by(INCOME_FREQ) %>%
# slice_head(n=2)
The range of salaries is massive, with extreme values at either end. This suggests that some respondents may have entered their salaries incorrectly.
# filter to just cases where income is above the 5th percentile and below the
# 95th? I.e., drop the top and bottom 5%.
income_statistics <- data %>%
  group_by(outsourcing_status) %>%
  summarise(
    mean = weighted.mean(income_annual, w = NatRepemployees, na.rm = T),
    median = wtd.quantile(income_annual, w = NatRepemployees, probs = c(.5), na.rm = T),
    min = wtd.quantile(income_annual, w = NatRepemployees, probs = c(0), na.rm = T),
    max = wtd.quantile(income_annual, w = NatRepemployees, probs = c(1), na.rm = T),
    stdev = sqrt(wtd.var(income_annual, w = NatRepemployees, na.rm = T))
  )
The distribution of income is highly skewed, with extremely high values creating a tail. It is likely that many extreme values are artefacts of the method of reporting. Below, we identify outliers as values 3 standard deviations above or below the mean. We do this iteratively, removing identified outliers and retesting, until no more outliers are identified.

Outliers are given a marker (income_drop == 1) to identify them. In analyses involving income, these outliers can be removed by selecting only cases where income_drop == 0.
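The trimming below is written out longhand, and repeated later for the combined income variables; the same logic could be wrapped in a reusable function, sketched here under the same assumptions (a hypothetical helper, not the code actually run):

# Sketch: iterative weighted 3-SD trimming, returning the IDs to flag.
# Assumes the ID column and the NatRepemployees weights used throughout.
flag_outliers <- function(df, var, w = "NatRepemployees", k = 3){
  x <- df
  flagged <- c()
  repeat{
    m <- Hmisc::wtd.mean(x[[var]], weights = x[[w]], na.rm = TRUE)
    s <- sqrt(Hmisc::wtd.var(x[[var]], weights = x[[w]], na.rm = TRUE))
    out <- which(x[[var]] < m - k * s | x[[var]] > m + k * s)
    if(length(out) == 0) break
    flagged <- append(flagged, x$ID[out])
    x <- x[-out, ]
  }
  flagged
}
# usage sketch: data$income_drop <- as.numeric(data$ID %in% flag_outliers(data, "income_weekly"))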
# this is removing outliers using annual income
# x <- data
#
# # calculate the mean and sd, and 3x sd
# mean = Hmisc::wtd.mean(x$income_annual, weights=x$NatRepemployees, na.rm=T)
# std = sqrt(Hmisc::wtd.var(x$income_annual,weights=x$NatRepemployees,na.rm = T))
# Tmin = mean-(3*std)
# Tmax = mean+(3*std)
#
# # identify the first set of outliers
# outliers <- which(x$income_annual < Tmin | x$income_annual > Tmax)
# outlier_count <- length(outliers) # count how many removed this iteration
# all_outlier_ids <- x$ID[outliers] # list of ids removed (to be removed from data later)
#
# # initialise some variables, i.e. iteration 0
# count <- 0 # iteration counter
# total_cases_removed <- 0 # total cases removed
#
# cat("Iteration ", count, ": ", outlier_count, " outliers\n")
#
# # Take a look at the distribution
# par(mfrow = c(1, 3)) # this lets us plot three plots in a row
# hist(x$income_annual, main = "Histogram")
# # mark the mean and sds for the histogram
# abline(v = mean, col='red', lwd = 3)
# abline(v = Tmin, col='blue', lwd = 3)
# abline(v = Tmax, col='blue', lwd = 3)
# boxplot(x$income_annual, main = "Boxplot")
# qqnorm(x$income_annual, main = "Normal Q-Q plot")
# mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
#
# # While there are still outliers detected, remove the outliers and recalculate
# # mean, sd, and 3*sd and remove the outliers based on these new figures.
# while(length(outliers) != 0){
# count <- count + 1
# x <- x[-outliers,] # remove the outliers identified in the previous iteration
#
# # recalculate
# mean = Hmisc::wtd.mean(x$income_annual, weights=x$NatRepemployees, na.rm=T)
# std = sqrt(Hmisc::wtd.var(x$income_annual,weights=x$NatRepemployees,na.rm = T))
# Tmin = mean - (3 * std)
# Tmax = mean + (3 * std)
# outliers <- which(x$income_annual < Tmin | x$income_annual > Tmax)
# outlier_ids <- x$ID[outliers] # get outlier ids
# all_outlier_ids <- append(all_outlier_ids, outlier_ids) # add removed outliers to outlier list
# total_cases_removed <- total_cases_removed + outlier_count # count total
#
# outlier_count <- length(outliers) # count how many removed this iteration
# cat("Iteration ", count, ": ", outlier_count, " outliers\n")
#
# # Replot distributions to see how they've changed
# par(mfrow = c(1, 3))
# hist(x$income_annual, main = "Histogram")
# abline(v = mean, col='red', lwd = 3)
# abline(v = Tmin, col='blue', lwd = 3)
# abline(v = Tmax, col='blue', lwd = 3)
# boxplot(x$income_annual, main = "Boxplot")
# qqnorm(x$income_annual, main = "Normal Q-Q plot")
# mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
# }
# cat(total_cases_removed, " cases removed in total, across ", count, " iterations")
#
# # Drop the cases identified as outliers from data
# data$income_drop[which(data$ID %in% all_outlier_ids)] <- 1
# data$income_drop[is.na(data$income_drop)] <- 0 # make NAs 0
#
# # Check this looks right
# sum(data$income_drop, na.rm = T) # this should be equal to all_outlier_ids
#
# # This should look the same as the last iteration above#
# test <- data %>%
# filter(income_drop == 0)
#
# mean = Hmisc::wtd.mean(x$income_annual, weights=x$NatRepemployees, na.rm=T)
# std = sqrt(Hmisc::wtd.var(x$income_annual,weights=x$NatRepemployees,na.rm = T))
# Tmin = mean - (3 * std)
# Tmax = mean + (3 * std)
#
# par(mfrow = c(1, 3))
# hist(test$income_annual, main = "Histogram")
# abline(v = mean, col='red', lwd = 3)
# abline(v = Tmin, col='blue', lwd = 3)
# abline(v = Tmax, col='blue', lwd = 3)
# boxplot(test$income_annual, main = "Boxplot")
# qqnorm(test$income_annual, main = "Normal Q-Q plot")
# mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
# this is removing outliers using weekly income (it results in the same number of cases removed)
x <- data

# calculate the mean and sd, and 3x sd
mean = Hmisc::wtd.mean(x$income_weekly, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_weekly, weights=x$NatRepemployees, na.rm = T))
Tmin = mean - (3*std)
Tmax = mean + (3*std)

# identify the first set of outliers
outliers <- which(x$income_weekly < Tmin | x$income_weekly > Tmax)
outlier_count <- length(outliers) # count how many removed this iteration
all_outlier_ids <- x$ID[outliers] # list of ids removed (to be removed from data later)

# initialise some variables, i.e. iteration 0
count <- 0 # iteration counter
total_cases_removed <- 0 # total cases removed

cat("Iteration ", count, ": ", outlier_count, " outliers\n")
Iteration 0 : 159 outliers
# Take a look at the distribution
par(mfrow = c(1, 3)) # this lets us plot three plots in a row
hist(x$income_weekly, main = "Histogram")
# mark the mean and sds for the histogram
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(x$income_weekly, main = "Boxplot")
qqnorm(x$income_weekly, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
# While there are still outliers detected, remove the outliers and recalculate
# mean, sd, and 3*sd and remove the outliers based on these new figures.
while(length(outliers) != 0){
  count <- count + 1
  x <- x[-outliers,] # remove the outliers identified in the previous iteration

  # recalculate
  mean = Hmisc::wtd.mean(x$income_weekly, weights=x$NatRepemployees, na.rm=T)
  std = sqrt(Hmisc::wtd.var(x$income_weekly, weights=x$NatRepemployees, na.rm = T))
  Tmin = mean - (3 * std)
  Tmax = mean + (3 * std)
  outliers <- which(x$income_weekly < Tmin | x$income_weekly > Tmax)
  outlier_ids <- x$ID[outliers] # get outlier ids
  all_outlier_ids <- append(all_outlier_ids, outlier_ids) # add removed outliers to outlier list
  total_cases_removed <- total_cases_removed + outlier_count # count total

  outlier_count <- length(outliers) # count how many removed this iteration
  cat("Iteration ", count, ": ", outlier_count, " outliers\n")

  # Replot distributions to see how they've changed
  par(mfrow = c(1, 3))
  hist(x$income_weekly, main = "Histogram")
  abline(v = mean, col='red', lwd = 3)
  abline(v = Tmin, col='blue', lwd = 3)
  abline(v = Tmax, col='blue', lwd = 3)
  boxplot(x$income_weekly, main = "Boxplot")
  qqnorm(x$income_weekly, main = "Normal Q-Q plot")
  mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
}
Iteration 1 : 148 outliers
Iteration 2 : 160 outliers
Iteration 3 : 94 outliers
Iteration 4 : 64 outliers
Iteration 5 : 18 outliers
Iteration 6 : 6 outliers
Iteration 7 : 1 outliers
Iteration 8 : 0 outliers
cat(total_cases_removed, " cases removed in total, across ", count, " iterations")
650 cases removed in total, across 8 iterations
# Drop the cases identified as outliers from data
data$income_drop <- NA # initialise the flag
data$income_drop[which(data$ID %in% all_outlier_ids)] <- 1
data$income_drop[is.na(data$income_drop)] <- 0 # make NAs 0

# Check this looks right
sum(data$income_drop, na.rm = T) # this should equal length(all_outlier_ids)
[1] 650
# This should look the same as the last iteration above
test <- data %>%
  filter(income_drop == 0)

mean = Hmisc::wtd.mean(x$income_weekly, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_weekly, weights=x$NatRepemployees, na.rm = T))
Tmin = mean - (3 * std)
Tmax = mean + (3 * std)
par(mfrow = c(1, 3))
hist(test$income_weekly, main = "Histogram")
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(test$income_weekly, main = "Boxplot")
qqnorm(test$income_weekly, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
As shown above, the process took 8 iterations and identified 650 observations in total, which should be removed when examining income data (total non-NA income_annual N after removal = 6849). The resulting income distribution now looks far more sensible.
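A quick way to verify the remaining sample size quoted above:

# non-missing annual incomes among unflagged cases; should match the 6849 above
data %>%
  filter(income_drop == 0) %>%
  summarise(n_income_annual = sum(!is.na(income_annual)))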
The function below extracts the midpoint of each income bracket or, for the "less than" and "over" brackets, the single boundary value.
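For illustration, this is how the core regular expression behaves on hypothetical bracket labels:

# hypothetical bracket labels, not actual data values
stringr::str_extract("£20,000 up to £25,000", "\\d{1,3}(?:,\\d{3})*(?:\\.\\d+)?") # "20,000"
stringr::str_extract("Less than £10,000", "\\d{1,3}(?:,\\d{3})*(?:\\.\\d+)?")     # "10,000"
# commas are then stripped, and where both bounds exist their midpoint is taken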
closed_income <- data %>%
  filter(is.na(INCOME_OPEN_1)) %>%
  select(c(ID, contains("INCOME", ignore.case = F))) %>%
  select(ends_with("labelled"))
convert_to_numeric <- function(data) {
  # Select the closed income columns. We only want the labelled ones.
  matching_columns <- grep("CLOSED", colnames(data), ignore.case = FALSE)
  matching_columns <- matching_columns[grep("labelled", colnames(data)[matching_columns], ignore.case = FALSE)]
  matching_column_names <- colnames(data)[matching_columns]

  #column <- "INCOME_CLOSED_ANNUAL_labelled"
  # Iterate through the closed variables.
  # For each one, split its range into two separate values and calculate the midpoint,
  # or for "less than" and "over", just take the single value and use that.
  for(column in matching_column_names){
    temp_df <- data %>%
      #mutate(copy = INCOME_CLOSED_ANNUAL_labelled) %>% # just for testing
      separate_wider_delim(column, names = c("min_income","max_income"), delim = " up to £", too_few = "align_start")

    temp_df <- temp_df %>%
      mutate(across(c(min_income, max_income), ~gsub("£", "", .))) %>% # sub out the £ sign
      # Extract the "less than" and "over" values; the regex matches 1-3 digits,
      # then groups of a comma plus three digits, then optionally a decimal point
      # and any number of digits. The comma separator is removed in the same operation.
      mutate(
        min_income = gsub(",", "", str_extract(min_income, "\\d{1,3}(?:,\\d{3})*(?:\\.\\d+)?"))
      ) %>%
      # sub out the comma separator again (this is just for max_income, but run it on min_income anyway)
      mutate(across(c(min_income, max_income), ~gsub(",", "", .))) %>%
      mutate(across(c(min_income, max_income), as.numeric)) %>% # note we lose 'prefer not to say' here
      # Where we have both min and max, calculate the midpoint; otherwise just keep min
      mutate(midpoint = case_when(!is.na(min_income) & !is.na(max_income) ~ (min_income + max_income) / 2,
                                  TRUE ~ min_income))

    # Add the new variable into the data
    data <- data %>%
      mutate(
        midpoint = temp_df$midpoint, .after = column
      )
    # make a sensible colname
    colnames(data)[colnames(data) == "midpoint"] <- paste0(column, "_midpoint")
  }
  return(data)
}

data <- convert_to_numeric(data)
Now we calculate annual income in the same way as we did for the open questions.
data <- data %>%
  # get values of labels
  # mutate_all(haven::as_factor) %>%
  mutate(
    # make all incomes annual, again using the holiday-adjusted working_weeks from above
    income_annual_closed = case_when(INCOME_FREQ == 1 ~ INCOME_CLOSED_ANNUAL_labelled_midpoint,
                                     INCOME_FREQ == 2 ~ INCOME_CLOSED_MONTHLY_labelled_midpoint * 12,
                                     INCOME_FREQ == 3 ~ INCOME_CLOSED_WEEKLY_labelled_midpoint * working_weeks,
                                     INCOME_FREQ == 4 ~ INCOME_CLOSED_HOURLY_labelled_midpoint * HOURS * working_weeks,
                                     TRUE ~ NA),
    income_weekly_closed = case_when(INCOME_FREQ == 1 ~ INCOME_CLOSED_ANNUAL_labelled_midpoint / working_weeks,
                                     INCOME_FREQ == 2 ~ (INCOME_CLOSED_MONTHLY_labelled_midpoint * 12) / working_weeks,
                                     INCOME_FREQ == 3 ~ INCOME_CLOSED_WEEKLY_labelled_midpoint,
                                     INCOME_FREQ == 4 ~ INCOME_CLOSED_HOURLY_labelled_midpoint * HOURS,
                                     TRUE ~ NA),
    income_hourly_closed = income_weekly_closed / HOURS,
    income_percentile_closed = ntile(income_annual_closed, 100),
    income_weekly_closed_percentile = ntile(income_weekly_closed, 100),
    income_hourly_closed_percentile = ntile(income_hourly_closed, 100)
  )
Next, combine the open and closed responses into single combined income variables.
data <- data %>%
  mutate(
    income_annual_all = case_when(!is.na(income_annual) ~ income_annual,
                                  !is.na(income_annual_closed) ~ income_annual_closed,
                                  TRUE ~ NA),
    income_weekly_all = case_when(!is.na(income_weekly) ~ income_weekly,
                                  !is.na(income_weekly_closed) ~ income_weekly_closed,
                                  TRUE ~ NA),
    income_hourly_all = case_when(!is.na(income_hourly) ~ income_hourly,
                                  !is.na(income_hourly_closed) ~ income_hourly_closed,
                                  TRUE ~ NA)
  )
# Double check this leads to what we expect
# Yes, this provides the same number of non-NA values as calculated above:
# 1212 total NAs = the 1211 closed-format respondents who answered only
# 'prefer not to say', plus the 1 respondent who answered no income question
sum(!is.na(data$income_annual_all))
[1] 8943
sum(is.na(data$income_annual_all))
[1] 1212
sum(!is.na(data$income_weekly_all))
[1] 8943
sum(is.na(data$income_weekly_all))
[1] 1212
sum(!is.na(data$income_hourly_all))
[1] 8943
sum(is.na(data$income_hourly_all))
[1] 1212
Apply the same outlier-removal process as above to the combined income variables.
# annual income all - same result as weekly
x <- data

# calculate the mean and sd, and 3x sd
mean = Hmisc::wtd.mean(x$income_annual_all, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_annual_all, weights=x$NatRepemployees, na.rm = T))
Tmin = mean - (3*std)
Tmax = mean + (3*std)

# identify the first set of outliers
outliers <- which(x$income_annual_all < Tmin | x$income_annual_all > Tmax)
outlier_count <- length(outliers) # count how many removed this iteration
all_outlier_ids <- x$ID[outliers] # list of ids removed (to be removed from data later)

# initialise some variables, i.e. iteration 0
count <- 0 # iteration counter
total_cases_removed <- 0 # total cases removed

cat("Iteration ", count, ": ", outlier_count, " outliers\n")
Iteration 0 : 173 outliers
# Take a look at the distribution
par(mfrow = c(1, 3)) # this lets us plot three plots in a row
hist(x$income_annual_all, main = "Histogram")
# mark the mean and sds for the histogram
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(x$income_annual_all, main = "Boxplot")
qqnorm(x$income_annual_all, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
# While there are still outliers detected, remove the outliers and recalculate
# mean, sd, and 3*sd and remove the outliers based on these new figures.
while(length(outliers) != 0){
  count <- count + 1
  x <- x[-outliers,] # remove the outliers identified in the previous iteration

  # recalculate
  mean = Hmisc::wtd.mean(x$income_annual_all, weights=x$NatRepemployees, na.rm=T)
  std = sqrt(Hmisc::wtd.var(x$income_annual_all, weights=x$NatRepemployees, na.rm = T))
  Tmin = mean - (3 * std)
  Tmax = mean + (3 * std)
  outliers <- which(x$income_annual_all < Tmin | x$income_annual_all > Tmax)
  outlier_ids <- x$ID[outliers] # get outlier ids
  all_outlier_ids <- append(all_outlier_ids, outlier_ids) # add removed outliers to outlier list
  total_cases_removed <- total_cases_removed + outlier_count # count total

  outlier_count <- length(outliers) # count how many removed this iteration
  cat("Iteration ", count, ": ", outlier_count, " outliers\n")

  # Replot distributions to see how they've changed
  par(mfrow = c(1, 3))
  hist(x$income_annual_all, main = "Histogram")
  abline(v = mean, col='red', lwd = 3)
  abline(v = Tmin, col='blue', lwd = 3)
  abline(v = Tmax, col='blue', lwd = 3)
  boxplot(x$income_annual_all, main = "Boxplot")
  qqnorm(x$income_annual_all, main = "Normal Q-Q plot")
  mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
}
Iteration 1 : 155 outliers
Iteration 2 : 180 outliers
Iteration 3 : 88 outliers
Iteration 4 : 46 outliers
Iteration 5 : 9 outliers
Iteration 6 : 1 outliers
Iteration 7 : 0 outliers
cat(total_cases_removed, " cases removed in total, across ", count, " iterations")
652 cases removed in total, across 7 iterations
# Drop the cases identified as outliers from data
data$income_drop_all <- NA # initialise the flag
data$income_drop_all[which(data$ID %in% all_outlier_ids)] <- 1
data$income_drop_all[is.na(data$income_drop_all)] <- 0 # make NAs 0

# Check this looks right
sum(data$income_drop_all, na.rm = T) # this should equal length(all_outlier_ids)
[1] 652
# This should look the same as the last iteration above
test <- data %>%
  filter(income_drop_all == 0)

mean = Hmisc::wtd.mean(x$income_annual_all, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_annual_all, weights=x$NatRepemployees, na.rm = T))
Tmin = mean - (3 * std)
Tmax = mean + (3 * std)
par(mfrow = c(1, 3))
hist(test$income_annual_all, main = "Histogram")
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(test$income_annual_all, main = "Boxplot")
qqnorm(test$income_annual_all, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
# income weekly all
x <- data

# calculate the mean and sd, and 3x sd
mean = Hmisc::wtd.mean(x$income_weekly_all, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_weekly_all, weights=x$NatRepemployees, na.rm = T))
Tmin = mean - (3*std)
Tmax = mean + (3*std)

# identify the first set of outliers
outliers <- which(x$income_weekly_all < Tmin | x$income_weekly_all > Tmax)
outlier_count <- length(outliers) # count how many removed this iteration
all_outlier_ids <- x$ID[outliers] # list of ids removed (to be removed from data later)

# initialise some variables, i.e. iteration 0
count <- 0 # iteration counter
total_cases_removed <- 0 # total cases removed

cat("Iteration ", count, ": ", outlier_count, " outliers\n")
Iteration 0 : 173 outliers
# Take a look at the distribution
par(mfrow = c(1, 3)) # this lets us plot three plots in a row
hist(x$income_weekly_all, main = "Histogram")
# mark the mean and sds for the histogram
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(x$income_weekly_all, main = "Boxplot")
qqnorm(x$income_weekly_all, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
# While there are still outliers detected, remove the outliers and recalculate
# mean, sd, and 3*sd and remove the outliers based on these new figures.
while(length(outliers) != 0){
  count <- count + 1
  x <- x[-outliers,] # remove the outliers identified in the previous iteration

  # recalculate
  mean = Hmisc::wtd.mean(x$income_weekly_all, weights=x$NatRepemployees, na.rm=T)
  std = sqrt(Hmisc::wtd.var(x$income_weekly_all, weights=x$NatRepemployees, na.rm = T))
  Tmin = mean - (3 * std)
  Tmax = mean + (3 * std)
  outliers <- which(x$income_weekly_all < Tmin | x$income_weekly_all > Tmax)
  outlier_ids <- x$ID[outliers] # get outlier ids
  all_outlier_ids <- append(all_outlier_ids, outlier_ids) # add removed outliers to outlier list
  total_cases_removed <- total_cases_removed + outlier_count # count total

  outlier_count <- length(outliers) # count how many removed this iteration
  cat("Iteration ", count, ": ", outlier_count, " outliers\n")

  # Replot distributions to see how they've changed
  par(mfrow = c(1, 3))
  hist(x$income_weekly_all, main = "Histogram")
  abline(v = mean, col='red', lwd = 3)
  abline(v = Tmin, col='blue', lwd = 3)
  abline(v = Tmax, col='blue', lwd = 3)
  boxplot(x$income_weekly_all, main = "Boxplot")
  qqnorm(x$income_weekly_all, main = "Normal Q-Q plot")
  mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
}
Iteration 1 : 155 outliers
Iteration 2 : 180 outliers
Iteration 3 : 88 outliers
Iteration 4 : 46 outliers
Iteration 5 : 9 outliers
Iteration 6 : 1 outliers
Iteration 7 : 0 outliers
cat(total_cases_removed, " cases removed in total, across ", count, " iterations")
652 cases removed in total, across 7 iterations
# Drop the cases identified as outliers from data
data$income_drop_all <- NA # (re)initialise the flag
data$income_drop_all[which(data$ID %in% all_outlier_ids)] <- 1
data$income_drop_all[is.na(data$income_drop_all)] <- 0 # make NAs 0

# Check this looks right
sum(data$income_drop_all, na.rm = T) # this should equal length(all_outlier_ids)
[1] 652
# This should look the same as the last iteration above
test <- data %>%
  filter(income_drop_all == 0)

mean = Hmisc::wtd.mean(x$income_weekly_all, weights=x$NatRepemployees, na.rm=T)
std = sqrt(Hmisc::wtd.var(x$income_weekly_all, weights=x$NatRepemployees, na.rm = T))
Tmin = mean - (3 * std)
Tmax = mean + (3 * std)
par(mfrow = c(1, 3))
hist(test$income_weekly_all, main = "Histogram")
abline(v = mean, col='red', lwd = 3)
abline(v = Tmin, col='blue', lwd = 3)
abline(v = Tmax, col='blue', lwd = 3)
boxplot(test$income_weekly_all, main = "Boxplot")
qqnorm(test$income_weekly_all, main = "Normal Q-Q plot")
mtext(paste0("Iteration ", count, ": ", outlier_count, " outlier(s)"), side = 1, line = -2, outer = TRUE)
After further discussion and consultation with JRF analysts, who felt the 80% threshold was high, we have reverted to using two-thirds of ASHE regional median weekly pay as the low-pay threshold. This is consistent with what JRF has done in the past and with the OECD's definition (see https://data.oecd.org/earnwage/wage-levels.htm).
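As a worked example with a hypothetical figure: for a region whose ASHE median gross weekly pay is £600, the low-pay threshold works out as follows:

# hypothetical regional median, for illustration only
region_median <- 600
region_median * (2/3) # low-pay threshold: anyone at or below £400/week is 'low paid'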
# get the ashe data
# from https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/earningsandhoursworkedukregionbyagegroup
url <- 'https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/earningsandhoursworkedukregionbyagegroup/2023provisional/ashetableregionbyage2023provisional.zip'

filename <- basename(url) # this takes the filename from the url
filepath <- paste0("../data/", filename)
output_dir <- substr(filepath, start = 1, stop = nchar(filepath) - 4) # strip the .zip extension

# check if the file exists. if it doesn't, download and unzip
if(!file.exists(filepath)){
  cat("Downloading data\n")
  download.file(url, destfile = filepath, mode = "wb")
  unzip(filepath, exdir = output_dir)
} else{
  cat("Data already in directory. Loading it.\n")
}
Data already in directory. Loading it.
# Use list.files to get the gross weekly pay Excel files in the directory
files <- list.files(path = output_dir, pattern = '* Weekly pay - Gross 2023.xls$', full.names = TRUE)

ashe_data <- readxl::read_excel(files[1], sheet = 'All', skip = 4) %>%
  filter(!is.na(Code)) %>%
  #select(-c(last_col(offset=2):last_col(), contains('change'))) %>% # if we want some other variables
  janitor::clean_names() %>%
  rename(
    #jobs_thousands = thousand,
    Region = description,
    Region_median_income = median
  ) %>%
  select(Region, Region_median_income) %>% # if we just want the median
  # rename problematic regions
  mutate(
    Region = case_when(Region == "East" ~ "East of England",
                       Region == "Yorkshire and The Humber" ~ "Yorkshire and the Humber",
                       TRUE ~ Region),
    Region_median_income = as.numeric(Region_median_income)
  )
# join to data
data <- data %>%
  left_join(., ashe_data, by = "Region")

# calculate the 4/5 and 2/3 thresholds
data <- data %>%
  mutate(
    Region_four_fifths = Region_median_income * .8,
    Region_two_thirds = Region_median_income * (2/3),
    UK_median = ashe_data %>% filter(Region == "United Kingdom") %>% pull(Region_median_income),
    UK_four_fifths = UK_median * 0.8
  )
# stats by region
income_statistics_regional <- data %>%
  filter(income_drop == 0 & !is.na(income_weekly_all)) %>%
  group_by(Region) %>%
  summarise(
    mean = weighted.mean(income_weekly_all, w = NatRepemployees, na.rm = T),
    median = wtd.quantile(income_weekly_all, w = NatRepemployees, probs = c(.5), na.rm = T),
    min = wtd.quantile(income_weekly_all, w = NatRepemployees, probs = c(0), na.rm = T),
    max = wtd.quantile(income_weekly_all, w = NatRepemployees, probs = c(1), na.rm = T),
    stdev = sqrt(wtd.var(income_weekly_all, w = NatRepemployees, na.rm = T)),
    n = n()
  )
par(mar = c(2, 2, 2, 2))
# plot the distribution of income
# blue lines are regional values, red lines are UK values
# lighter lines are the actual medians, darker lines are 4/5 medians
# data %>%
# filter(income_drop == 0 & !is.na(income_weekly_all)) %>%
# ggplot(., aes(x="",y=income_weekly_all)) +
# facet_wrap(~Region) +
# geom_violin() +
# geom_boxplot(width = 0.3) +
# geom_hline(aes(yintercept = UK_median), colour = "red", alpha=0.25) +
# geom_hline(aes(yintercept = UK_four_fifths), colour = "red") +
# geom_hline(aes(yintercept = Region_median_income), colour = "blue", alpha = 0.25) +
# geom_hline(aes(yintercept = Region_four_fifths), colour = "blue") +
# theme_minimal()
# Let's take a look at two-thirds too
# here blue lines are 2/3, red lines are 4/5
data %>%
  filter(income_drop == 0 & !is.na(income_weekly_all)) %>%
  ggplot(., aes(x = "", y = income_weekly_all)) +
  facet_wrap(~Region) +
  geom_violin() +
  geom_boxplot(width = 0.3) +
  geom_hline(aes(yintercept = Region_four_fifths), colour = "red") +
  geom_hline(aes(yintercept = Region_two_thirds), colour = "blue") +
  theme_minimal()
# Use 2/3 because that's what the OECD uses and what JRF have used in the past
# https://data.oecd.org/earnwage/wage-levels.htm
# Create the grouping variable based on Region_two_thirds:
# low pay is less than or equal to 2/3 of regional median earnings
data <- data %>%
  mutate(
    income_group = case_when(income_weekly_all <= Region_two_thirds ~ "Low",
                             income_weekly_all > Region_two_thirds ~ "Not low",
                             TRUE ~ NA),
    income_group = factor(income_group, levels = c("Not low", "Low"), exclude = NA)
  ) %>%
  ungroup()
# Visualise it to check
# 26.77% of people are low paid
data %>%
  filter(income_drop == 0 & !is.na(income_weekly_all)) %>%
  dplyr::group_by(income_group) %>%
  dplyr::summarise(
    n = sum(NatRepemployees)
  ) %>%
  mutate(
    perc = 100 * (n / sum(n))
  ) %>%
  ggplot(aes("", perc, fill = income_group)) +
  geom_col() +
  theme_minimal() +
  # theme(plot.margin = unit(c(0,4,0,4), "inches")) +
  geom_label(aes(label = paste0(round(perc, 2), "%")))
## Comparing outsourced to not outsourced
## 25.88% of non-outsourced workers are low paid vs 31.2% of outsourced workers
data %>%
  filter(income_drop == 0 & !is.na(income_weekly_all)) %>%
  dplyr::group_by(outsourcing_status, income_group) %>%
  dplyr::summarise(
    n = sum(NatRepemployees)
  ) %>%
  mutate(
    perc = 100 * (n / sum(n))
  ) %>%
  ggplot(aes(outsourcing_status, perc, fill = income_group)) +
  geom_col() +
  theme_minimal() +
  # theme(plot.margin = unit(c(0,4,0,4), "inches")) +
  geom_label(aes(label = paste0(round(perc, 2), "%")))
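The same comparison can also be read off as a weighted table rather than a chart; a minimal sketch using dplyr::count() with the survey weight:

# Weighted counts and within-group percentages (same filter as above)
data %>%
  filter(income_drop == 0 & !is.na(income_weekly_all)) %>%
  count(outsourcing_status, income_group, wt = NatRepemployees) %>%
  group_by(outsourcing_status) %>%
  mutate(perc = 100 * n / sum(n)) %>%
  ungroup()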
# One last check to see people have been categorised correctly
# test <- data %>%
#   filter(income_drop == 0) %>% # why do we still get NAs for annual income?
#   select(income_weekly_all, Region_two_thirds, income_group) %>%
#   mutate(
#     test = case_when(income_weekly_all <= Region_two_thirds ~ "Low",
#                      income_weekly_all > Region_two_thirds ~ "Not low",
#                      TRUE ~ NA)
#   )
# # check that they're the same - value should be 0
# sum(test$income_group != test$test, na.rm = T)
# test significance
<- glm(income_group ~ outsourcing_status, data, family="binomial", weights = NatRepemployees)
mod summary(mod)
Call:
glm(formula = income_group ~ outsourcing_status, family = "binomial",
data = data, weights = NatRepemployees)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.13900 0.02717 -41.918 < 0.0000000000000002
outsourcing_statusOutsourced 0.21801 0.06292 3.465 0.00053
(Intercept) ***
outsourcing_statusOutsourced ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 10002.1 on 8942 degrees of freedom
Residual deviance: 9990.4 on 8941 degrees of freedom
(1212 observations deleted due to missingness)
AIC: 10890
Number of Fisher Scoring iterations: 4
test <- summary(mod)

or <- exp(mod[["coefficients"]][["outsourcing_statusOutsourced"]])
p <- test[["coefficients"]][2, 4]
The difference is confirmed statistically: the odds of being in the low-income group are about 1.24 times higher for an outsourced worker than for a non-outsourced worker (OR = exp(0.218) ≈ 1.24, p < 0.001). In the full analysis we can control for other factors (e.g., age, gender) when testing this relationship.
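As a cross-check on the effect size, the odds ratio and a 95% Wald confidence interval can be pulled straight from the fitted model; a minimal sketch (confint.default() gives Wald intervals on the log-odds scale, which we exponentiate):

# Odds ratios with 95% Wald CIs for each term in mod
or_table <- exp(cbind(OR = coef(mod), confint.default(mod)))
round(or_table, 3)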
When reporting income, respondents could choose to report via an open text question or a bracketed (closed) question. All income-based work in this report so far has used only the open text responses. We may want to combine data from the closed questions with the open ones, although there is no single obvious method for doing so; the checks below explore the options (one of them, taking the midpoint of each bracket label, is sketched after the summary tables below).
income_subset <- data %>%
  select(INCOME_FREQ, contains("INCOME_OPEN"), income_annual, contains("INCOME_CLOSED_")) %>%
  mutate(
    INCOME_FREQ = haven::as_factor(INCOME_FREQ)
  )
# Check whether any of the closed income variables is non-missing on the same rows
# as the open (derived) income. There are none.
vars <- colnames(select(income_subset, contains("INCOME_CLOSED")))
for(i in vars){
  cat(paste0(i, ": ", sum(!is.na(income_subset$income_annual) & !is.na(income_subset[[i]]))), "\n")
}
INCOME_CLOSED_ANNUAL: 0
INCOME_CLOSED_ANNUAL_labelled: 0
INCOME_CLOSED_ANNUAL_labelled_midpoint: 0
INCOME_CLOSED_MONTHLY: 0
INCOME_CLOSED_MONTHLY_labelled: 0
INCOME_CLOSED_MONTHLY_labelled_midpoint: 0
INCOME_CLOSED_WEEKLY: 0
INCOME_CLOSED_WEEKLY_labelled: 0
INCOME_CLOSED_WEEKLY_labelled_midpoint: 0
INCOME_CLOSED_HOURLY: 0
INCOME_CLOSED_HOURLY_labelled: 0
INCOME_CLOSED_HOURLY_labelled_midpoint: 0
# Skim indicates completion of closed weekly/hourly is really low, but monthly/annual is a bit
# better. We might want to try to combine them somehow. Think about code to extract the numbers
# from the brackets, then take the midpoint of the two numbers: extract after '£' up to ' ',
# into two separate elements to start.
skim(income_subset)
Name | income_subset |
Number of rows | 10155 |
Number of columns | 16 |
_______________________ | |
Column type frequency: | |
factor | 5 |
numeric | 11 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
INCOME_FREQ | 0 | 1.00 | FALSE | 4 | Ann: 4515, Mon: 4215, Wee: 764, Hou: 661 |
INCOME_CLOSED_ANNUAL_labelled | 9315 | 0.08 | FALSE | 12 | Pre: 380, £22: 74, £28: 60, £16: 58 |
INCOME_CLOSED_MONTHLY_labelled | 8748 | 0.14 | FALSE | 12 | Pre: 635, £1,: 157, £94: 125, £1,: 120 |
INCOME_CLOSED_WEEKLY_labelled | 9884 | 0.03 | FALSE | 12 | Pre: 118, £33: 32, £11: 29, £22: 24 |
INCOME_CLOSED_HOURLY_labelled | 10018 | 0.01 | FALSE | 11 | Pre: 78, £10: 31, £12: 9, £14: 6 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
INCOME_OPEN_1 | 2656 | 0.74 | 21132.05 | 29878.94 | 6.00 | 1516.5 | 13500.0 | 33000.0 | 750000 | ▇▁▁▁▁ |
INCOME_OPEN_2 | 7500 | 0.26 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0 | ▁▁▇▁▁ |
income_annual | 2656 | 0.74 | 46884.73 | 104977.77 | 1440.00 | 19200.0 | 27588.0 | 40000.0 | 2559857 | ▇▁▁▁▁ |
INCOME_CLOSED_ANNUAL | 9315 | 0.08 | 8.96 | 3.38 | 1.00 | 6.0 | 11.0 | 12.0 | 12 | ▁▂▂▂▇ |
INCOME_CLOSED_ANNUAL_labelled_midpoint | 9695 | 0.05 | 33070.85 | 14140.52 | 5600.00 | 25200.5 | 30800.5 | 47600.5 | 56000 | ▃▇▃▆▇ |
INCOME_CLOSED_MONTHLY | 8748 | 0.14 | 7.86 | 4.12 | 1.00 | 4.0 | 8.0 | 12.0 | 12 | ▃▃▂▁▇ |
INCOME_CLOSED_MONTHLY_labelled_midpoint | 9383 | 0.08 | 1873.08 | 1043.83 | 470.00 | 1175.5 | 1645.5 | 2585.5 | 4700 | ▇▇▂▂▂ |
INCOME_CLOSED_WEEKLY | 9884 | 0.03 | 7.49 | 4.29 | 1.00 | 4.0 | 7.0 | 12.0 | 12 | ▅▃▂▁▇ |
INCOME_CLOSED_WEEKLY_labelled_midpoint | 10002 | 0.02 | 390.74 | 228.44 | 110.00 | 165.0 | 385.5 | 495.5 | 1100 | ▇▆▂▂▁ |
INCOME_CLOSED_HOURLY | 10018 | 0.01 | 9.11 | 4.73 | 1.00 | 3.0 | 13.0 | 13.0 | 13 | ▃▂▁▁▇ |
INCOME_CLOSED_HOURLY_labelled_midpoint | 10096 | 0.01 | 13.13 | 4.60 | 8.91 | 11.0 | 11.0 | 13.0 | 30 | ▇▂▁▁▁ |
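One concrete version of the bracket-midpoint idea flagged in the comment above, as a minimal sketch. The label format ("£20,000 - £24,999") is an assumption, and non-numeric labels such as "Prefer not to say" fall through to NA:

bracket_midpoint <- function(labels){
  # Pull every "£"-prefixed number out of each label, strip commas,
  # and average the pair; anything without exactly two numbers becomes NA
  nums <- stringr::str_extract_all(as.character(labels), "(?<=£)[0-9,]+")
  sapply(nums, function(x){
    x <- as.numeric(gsub(",", "", x))
    if(length(x) == 2) mean(x) else NA_real_
  })
}
bracket_midpoint(c("£20,000 - £24,999", "Prefer not to say"))
# [1] 22499.5      NA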
Feedback from 8 May was to incorporate the closed responses with the open responses. This has been done in the section above, 'Converting closed responses to continuous'.
Create a variable for shortened sector names to make them nicer to read.
data <- data %>%
  mutate(
    SectorName_short = SectorName_labelled
  ) %>%
  # make the sector names more readable
  separate_wider_delim(SectorName_short, names = c("SectorName_short", "SectorName_short_detail"),
                       delim = ";", too_few = "align_start") %>%
  mutate(
    SectorName_short = factor(stringr::str_to_sentence(SectorName_short)),
    SectorName_short_detail = factor(stringr::str_to_sentence(SectorName_short_detail))
  )
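To illustrate what the split does, a toy example with a hypothetical sector label; too_few = "align_start" pads the detail column with NA when there is no semicolon:

tibble(SectorName_short = c("Accommodation and food; pubs and restaurants",
                            "Construction")) %>%
  separate_wider_delim(SectorName_short,
                       names = c("SectorName_short", "SectorName_short_detail"),
                       delim = ";", too_few = "align_start")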
Make Region a factor, with London as the reference level.
data <- data %>%
  mutate(
    Region = forcats::fct_relevel(factor(Region), "London")
  )
# Make our disaggregated ethnicity groups identical to the original census groups,
# as per the request in "JRF - Outsourced workers - To dos"
data$Ethnicity_collapsed_disaggregated <- data$Ethnicity_labelled

data <- data %>%
  mutate(
    Ethnicity_collapsed = case_when(
      # Grouping White ethnicities
      Ethnicity %in% c(1) ~ "White British", # just White British, as the reference
      Ethnicity %in% c(2, 3, 4, 5) ~ "White other", # Gypsy, Irish Traveller, Roma, Irish and White other grouped together
      # Grouping Asian ethnicities
      Ethnicity %in% c(10, 11, 12, 13, 14) ~ "Asian/Asian British",
      # Grouping Black ethnicities
      Ethnicity %in% c(15, 16, 17) ~ "Black/African/Caribbean/Black British",
      # Grouping Mixed ethnicities
      Ethnicity %in% c(6, 7, 8, 9) ~ "Mixed/Multiple ethnic group",
      # Grouping Other ethnicities
      Ethnicity %in% c(18) ~ "Arab/British Arab",
      # Handling missing or ambiguous categories
      Ethnicity %in% c(19) ~ "Other ethnic group",
      # Prefer not to say
      Ethnicity %in% c(20, 21) ~ "Prefer not to say",
      # Default case for any unmatched entries
      TRUE ~ "Prefer not to say"
    )
  )
# Make White British the reference category
data$Ethnicity_collapsed <- relevel(factor(data$Ethnicity_collapsed), ref = "White British")

data <- data %>%
  mutate(
    Has_Degree = factor(Has_Degree, levels = c("No", "Yes", "Don't know"))
  )
# Make a binary born-in-UK variable
categories <- as.vector(unique(data$BORNUK_labelled))
non_categories <- categories[!(categories %in% "I was born in the UK")]

# Will throw an NA warning. I think this is OK, but investigate how to avoid the problem
data <- data %>%
  mutate(
    BORNUK_binary = forcats::fct_collapse(BORNUK_labelled,
                                          "Born in UK" = "I was born in the UK",
                                          "Not born in UK" = non_categories)
  )
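On the NA warning flagged in the comment: it most likely comes from unique() including an NA that then gets passed to fct_collapse() as a level. A sketch of one way to avoid it, under that assumption (the same fix applies to the ethnicity block below):

# Build the level vector without the NA that unique() carries through,
# so fct_collapse() only sees real levels
categories <- unique(as.character(data$BORNUK_labelled))
non_categories <- setdiff(categories[!is.na(categories)], "I was born in the UK")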
# Make a binary ethnicity variable
ethnicities <- as.vector(unique(data$Ethnicity_collapsed))
non_white_ethnicities <- ethnicities[!(ethnicities %in% "White British")]

# Will throw an NA warning. I think this is OK, but investigate how to avoid the problem
data <- data %>%
  mutate(
    Ethnicity_binary = forcats::fct_collapse(Ethnicity_collapsed,
                                             "White British" = c("White British"),
                                             "Non-White British" = non_white_ethnicities)
  )
# NB: readline() returns "" in a non-interactive session (e.g. when rendering),
# so the file is only saved when this is run interactively and answered "y"
check <- readline("Save data? (y/n)")
Save data? (y/n)
if(check == "y"){
  saveRDS(data, file = paste0("../Data/", format(Sys.Date(), "%Y-%m-%d"), " - Cleaned_Data.rds"))
  # haven::write_sav(as.data.frame(data), path = paste0("../Data/", format(Sys.Date(), "%Y-%m-%d"), " - Cleaned_Data.sav"))
  write_csv(data, file = paste0("../Data/", format(Sys.Date(), "%Y-%m-%d"), " - Cleaned_Data.csv"))
  print("File saved")
} else {
  print("File not saved")
}
[1] "File not saved"