knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

1 Preface

This document describes how to conduct initial data analysis in the context of longitudinal regression analyses. The project is described in the draft:

Initial data analysis for longitudinal studies to build a solid foundation for reproducible analysis

by Lara Lusa, Marianne Huebner, Carsten O. Schmidt, Katherine J. Lee, Saskia le Cessie, Mark Baillie, Frank Lawrence, C'ecile Proust-Lima, on behalf of TG3 of the STRATOS Initiative

#setting language to English
invisible(Sys.setlocale("LC_TIME", "C"))
# defining some additional functions that will be used in the document


color <- function(x, color = "red") {
  if (knitr::is_latex_output()) {
    sprintf("\\textcolor{%s}{%s}", color, x)
  } else if (knitr::is_html_output()) {
    sprintf("<span style='color: %s;'>%s</span>", color,
            x)
  } else
    x
}

#can be used inline to change the color of the text, default is red
#`r color("try red color")`, `r color("try blue color", "blue")`


#to reduce the fontsize of the chunks
def.chunk.hook  <- knitr::knit_hooks$get("chunk")
knitr::knit_hooks$set(
  chunk = function(x, options) {
    x <- def.chunk.hook(x, options)
    ifelse(
      options$size != "normalsize",
      paste0("\n \\", options$size, "\n\n", x, "\n\n \\normalsize"),
      x
    )
  }
)




# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <-
  function(...,
           plotlist = NULL,
           file,
           cols = 1,
           layout = NULL) {
    library(grid)
    
    # Make a list from the ... arguments and plotlist
    plots <- c(list(...), plotlist)
    
    numPlots = length(plots)
    
    # If layout is NULL, then use 'cols' to determine layout
    if (is.null(layout)) {
      # Make the panel
      # ncol: Number of columns of plots
      # nrow: Number of rows needed, calculated from # of cols
      layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
                       ncol = cols,
                       nrow = ceiling(numPlots / cols))
    }
    
    if (numPlots == 1) {
      print(plots[[1]])
      
    } else {
      # Set up the page
      grid.newpage()
      pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
      
      # Make each plot, in the correct location
      for (i in 1:numPlots) {
        # Get the i,j matrix positions of the regions that contain this subplot
        matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
        
        print(plots[[i]],
              vp = viewport(
                layout.pos.row = matchidx$row,
                layout.pos.col = matchidx$col
              ))
      }
    }
  }


#function that converts the format of the rows of a data frame, digits specify how many digits will be stored/printed for each ROW!
digitsByRows <- function(df, digits) {
  tmp0 <- data.frame(t(df))
  tmp1 <- mapply(function(df0, digits0) {
    formatC(df0, format = "f", digits = digits0)
  },
  df0 = tmp0, digits0 = digits)
  tmp1 <- data.frame(t(tmp1))
  names(tmp1) <- names(df)
  
  return(tmp1)
}

2 Initial data analysis and data screening checklist for longitudinal data

Initial data analysis (IDA) is the part of the data pipeline that takes place between the end of data retrieval and the beginning of data analysis that addresses the research question.

The main aim of IDA is to provide reliable knowledge about the data to ensure transparency and integrity of preconditions to conduct appropriate statistical analyses and correct interpretation of the results to answer pre-defined research questions.

A general framework of IDA for observational studies was proposed to include the following six steps: (1) metadata setup (to summarize the background information about data), (2) data cleaning (to identify and correct technical errors), (3) data screening (to examine data properties), (4) initial data reporting (to document findings from the previous steps), (5) refining and updating the research analysis plan, and (6) documenting and reporting IDA in research papers. An IDA plan for data screening in the context of regression models for continuous, count or binary outcomes was proposed recently [Heinze et al. 2023].

The goal of this study is to provide recommendations for a reproducible workflow for IDA in longitudinal studies, where participants are measured repeatedly over time and the main research question is addressed using a regression model; the focus is on data screening (IDA step 3), where the examination of the data properties provide to the data analyst important information related to the intended analysis.

References

Huebner M, le Cessie S, Schmidt CO, Vach W . A contemporary conceptual framework for initial data analysis. Observational Studies 2018; 4: 171-192. Link

Huebner M, Vach W, le Cessie S, Schmidt C, Lusa L. Hidden Analyses: a review of reporting practice and recommendations for more transparent reporting of initial data analyses. BMC Med Res Meth 2020; 20:61. Link

Heinze G, Baillie M, Lusa L, Saurbrei W, Schmidt CO, Harrell FE Jr, Huebner M. Regression without regrets –initial data analysis is an essential prerequisite to multivariable regression Link

2.1 Pre-requisites

We assume that meta-data provide sufficient detail about data characteristics and that data cleaning has been performed.

We assume that the study protocol describes a research question that involves longitudinal data, where the outcome variable is measured repeatedly over time, and is analyzed using a regression model applied to all time points or measurements. We assume that baseline explanatory variables are measured, and consider also the possibility of time-varying explanatory variables. We also assume that the analysis strategy has been pre-specified (describing type of statistical model for longitudinal data, variables to be considered for the model, expected methods for handling missing data, and model performance measurements).

Structural variables [Heinze et al. 2023] in the context of IDA are variables that help to structure IDA results for a clear organization and essential overview of data properties. Structural variables can be demographic variables or variables central to the research aim that help to organize IDA results. The association between the structural variables and the explanatory variables is explored in data screening; they can be used also for stratification purposes in multivariable analyses.

2.2 Data screening checklist for longitudinal studies

We propose a checklist for data screening step of the IDA framework, which includes data summaries to help understanding data properties, and their potential impact on the analyses and interpretation of results. The checklist can be used for observational longitudinal studies, which include panel studies, cohort studies, or retrospective studies.

Checklist for data screening of longitudinal studies

Topic Item Features
IDA screening domain: Participation profile
Time frame P1 Provide number of time points and intervals at which measurements are taken, using the time metric that best reflects the time of inclusion in the study (typically time from enrollment, or calendar time in studies that involve long enrollment times). Highlight the differences between the time of first measurements and follow-up times.
Time metric P2 Describe the time metric and corresponding time points specified in the analysis strategy, if different from the time metric described in P1.
Participants P3 Provide the number of participants who attended the assessment by time metric(s).
Extensions: Participation Profile
Other time metrics PE1 Use different time metric(s) to describe the time frame of the study, if applicable and appropriate, e.g. calendar time or measurement occasion.
Data collection PE2 Describe aspects of the data collection process that can have an impact on the data, if applicable. For example, describe if baseline and longitudinal measurements are different, possible changes in variable measurements though time, etc.
IDA screening domain: Missing data
Non-enrollment M1 Describe the non-enrolled, i.e., the participants that were selected but did not enter the study (and the reasons, if available), if applicable.
Drop-out M2 Describe the participants who dropped out from the study during the follow-up (loss to follow-up and other possible reasons: death, withdrawal, missing by design, if applicable).
Intermittent visit missingness M3 Describe the participants that have missing data for some of the measurements (intermittent, occasional omission, but do not drop out out of the study).
Variable (item) missingness M4 Provide the number and proportion of missing values for each variable at each time point as appropriate for fixed or time-varying variables. Describe missingness stratifying the summaries by variables that might influence the frequency of missing values, if relevant (for example: structural variables or levels of measurement).
Patterns M5 Describe patterns of missing values across variables at each time point and across time points.
Extensions: Missing data
Non-enrollment ME1 Compare the characteristics of the participants that entered the study with those of the non-enrolled or with the characteristics of the target population, if applicable and data are available.
Probability of drop-out ME2 Estimate the probability of drop-out after inclusion, taking appropriately into account the reasons for drop-out.
Dropout effect on outcome ME3 Visualize mean profiles of a continuous outcome by time metric stratified by time to drop-out.
Predictors of missingness ME4 Explore whether there are predictors of missingness by comparing complete vs incomplete cases or investigate predictors of time to dropout, as appropriate; this can assist in understanding of the missing data mechanism.
IDA screening domain: Univariate descriptions
Description of the variables at baseline U1 Summarize the outcome variable and the explanatory variables with numerical and graphical summaries at baseline.
Description of the time-varying variables at later points U2 Summarize the outcome variable and the time-varying explanatory variables also at later time points. This might require discretization of time intervals and/or the use of different time metrics.
IDA screening domain: Multivariate descriptions
Association at baseline V1 Visualize the association between each explanatory variable with the structural variables at baseline.
Correlation at baseline V2 Quantify association with pairwise correlation coefficients between all explanatory variables in a matrix or heatmap at baseline.
Interactions at baseline, if applicable V3 Evaluate bivariate distributions of the variables specified in the analysis strategy with an interaction term; include appropriate graphical displays.
Extensions: Multivariate descriptions
Stratification VE1 Compute summary statistics and describe variation between strata defined based on level of measurement, e.g. centers, providers, locations, or by structural variables or other variables described as stratification variables in the analysis strategy (at baseline, other time points/time intervals can be also included).
Associations and correlations at time-points beyond baseline VE2 Associations and correlations between explanatory variables at time points later than baseline to explore their possible change across time; his could be useful for the identification of auxillary variables. Selection might bias the results.
Sampling design VE3 If relevant, identify the stratifying variables and/or the clusters used in the sampling design; explore the distribution of the number of clusters (by stratification variables).
IDA screening domain: Longitudinal aspects
Profiles L1 Summarize changes and variability of variables within subjects, e.g. profile plots (spaghetti-plots) for groups of individuals.
Trends L2 Describe numerically or graphically longitudinal(average) trends of the outcome variable.
Correlation and variability L3 Estimate the strength of the within-participant correlation of the outcome variable between time points and its variability across time points.
Trends of time-varying explanatory variables L4 Describe numerically or graphically the longitudinal trends of the time-varying variables.
Extensions: Longitudinal aspects
Cohort/Period effects LE1 If appropriate, summarize possible cohorts or period effects (for example, age birth cohorts or period cohorts defined by the calendar time/wave of measurement) on the outcome, and on the explanatory variables, to assess if the variation of the outcome can occur because of these effects.

3 SHARE data description

The Survey of Health Ageing and Retirement (SHARE) in Europe consists of data on health and socioeconomic variables of non-institutionalized individuals aged 50 and older across 28 European countries and Israel [Boersch et al]. The data sets includes about 140,000 men and women, ages 50 or older, collected in years 2004 to 2018. Waves to be analyzed are 1 to 7 (note that SHARELIFE interview data from waves 3 and 7 were considered in this analysis).

3.1 Data availability

SHARE data are publicly available for scientific purposes after registration, details about registration are provided here.

3.2 Subset of the SHARE data analyzed in this project

In this report we analyze the data collected in Denmark, where the sampling was based on simple random sampling. We also limit the sample to individuals age above 50 at baseline interview and do not consider the participants that were first included in Wave 3 or 7 (by design should not have been included).

The general characteristics of the SHARE study are described elsewhere (see the citations below). Here we report the only the main study characteristics.

3.3 Sampling in Denmark

The study protocol describes the age-related inclusion criteria by wave. Participants in Wave 1 had to be born in 1954 or before; the study design planned full range refreshment samples in Waves 2 (birth year <=1956) and 5 (birth year <=1962), and refreshment sample of the youngest cohort only in Wave 4 (birth years 1957-60) and 6 (birth years 1963-4). The full range refreshment sampling include an over-sampling of the youngest cohorts that were not age-eligible in the previous refreshment samples to maintain the representation of younger cohorts and their aim is to compensate for the effect of panel attrition on all age cohorts.

3.4 Non-enrollment

The characteristics of the subjects that were selected for participation but that did not enter the study are not reported in the data set. Limited information is available only in the documentation published by the study (retrievable here), while detailed description of the selected sample is not provided.

3.5 Type of questionnaire

Different types of questionnaire were used during the study. By design, the baseline questionnaire is used for the first interview, the longitudinal questionnaire for the follow-up interviews. The questionnaire used in the SHARELIFE interviews (Wave 3 and partly Wave 7) includes only a subset of the questions from the longitudinal questionnaire, and additional questions about the history of the life of the participants is collected.

3.6 Changes in data collection process

The questionnaires used in the study contain thousands of questions divided in modules. Several changes occurred in the questionnaires used during the study. The study documentation includes the description of the changes. It is important to note that some questions might provide different type of information based on the type of questionnaire being used and that careful examination of the metadata is required. Some information related to the definition of variables across waves and questionnaires were used to define the analysis strategy (AS) (for example, height was used as a time fixed variable, as for most of the waves it was recorded only in the baseline interviews). Given the large number of questions and waves, retrieving all the relevant information from metadata can be challenging.

References: Börsch-Supan A, Brandt M, Hunkler C, Kneip T, Korbmacher J, Malter F, et al. Data Resource Profile: The Survey of Health, Ageing and Retirement in Europe (SHARE). Int J Epidemiol. 2013;42: 992–1001

Website: www.share-eric.eu.

SHARE release guide 7.1.1

4 Data example (from research question to IDA plan)

In this section we describe the research question based on our data example and the prerequisites for the defintion of an IDA data screening plan (definition of an analysis strategy, data retrieval and management, data dictionary, data cleaning, the background knowledge). Later we define a detialed IDA data screening plan.

4.1 Prerequisites for IDA plan

The research question aims at assessing the age-associated decline of hand grip strength by sex, after adjusting for a set of explanatory variables that are known to be associated with the outcome (weight, height, education level, physical activity and smoking).

Below we give a basic overview of the corresponding statistical analysis strategy.

4.1.1 Analysis stretegy

Study population: individuals from Denmark aged 50 or older at baseline interview.

Outcome: maximum grip strength measured at different interviews (recorded with a hand-held dynamometer, assessed as the maximum score out of two measurements per hand).

Time metric: age at interview.

Time-fixed variables evaluated at baseline interview: sex, height and education (categorized in three levels).

Time-varying variables: weight, physical activity (vigorous or low intensity, both dichotomized) and smoking status.

Interaction terms: interaction terms between age and all the time-fixed variables (sex, education, height) will be included to evaluate the association between these time-fixed variables with the trajectory of the outcome; the main interest will be in the interpretation of the interaction terms between sex and functions of age.

Nonlinear functional forms for continuous variables: will be assessed using linear, quadratic, and cubic polynomials.

Model: linear mixed model. The trajectory over time of the outcome is explained at the population level using fixed effects and individual-specific deviations from the population trajectory are captured using random effects to account for the intra-individual serial correlation. The model accommodates individual-specific times of outcome measurements.

Assumptions about missing data mechanism: the linear mixed model, estimated by maximum likelihood, assumes that the data are missing at random (more details are given later). Missing data at variable/item level (for the time-fixed explanatory variables) can be handled either by considering complete cases or by performing multiple imputation.

We will used data from the SHARE study that are publicly available upon registration for use for research purposes. All analyses will be carried out using R statistical language .

4.1.2 Data retrieval and data management

After registration, SHARE data from release 7.0.1 were downloaded from the SHARE data center. Only a selected subset of the thousands of available variables were imported in the R program for analysis, using the general purpose functions in R language for SHARE data export and steps for data cleaning that were described by Lusa and Huebner [LusaHuebner]; data import and initial data manipulation used the proposed approach, and is described in the file ImportDataGS_Denmark_longitudinal_v3.html, where details are provided.

In brief, the exported variables included the outcome (variable summarized by SHARE and individual measurements and reason for missing values), explanatory variables (sex, education, height, time varying explanatory variables are: weight, smoking status, vigorous physical activity, moderate physical activity), and some additional variables: wave, date of interview, date of birth, type of questionnaire, baseline wave.

Additional information about participation in the study was retrieved from the cv module, which summarizes the participation in the study of the individuals that entered the study (and information about death), and in the xt module which includes the information from the exit questionnaire (after death of the participants).

Datasets in long and in wide format (by wave and measurement occasion, defined as the number of waves since first available measurement +1) were generated.

References:

Lusa, L. Huebner, M. Organizing and Analyzing Data from the SHARE Study with an Application to Age and Sex Differences in Depressive Symptoms. Int. J. Environ. Res. Public Health 2021, 18, 9684. Link

4.1.3 Data cleaning

Data cleaning involved the removal of implausible values (for example, very low/high weights and heights). Details are given in the supporting file ImportDataGS_Denmark_longitudinal_v3.html.

4.1.4 Data dictionary

Metadata and extensive documentation for the SHARE data is available on the web site of the project Link.

Here we report the main characteristics of the variables that will be used in this project

knitr::kable(descr.var,format="html", caption="Variables exported") %>% kable_styling()
Variables exported
variable label value unit SHARE.name explanation
mergeid unique identifier of participant format: DK-012345-01 NA mergeid Unique identifier for the subjects, remains the same in all waves.
firstwave first wave where the person was included “Wave 1”, “Wave 2” ,…, “Wave7” NA firstwave Wave where the participant was first included in the study
wave Wave “Wave 1”, “Wave 2” ,…, “Wave7” NA wave Wave when the interview was carried out
int_year year of interview 2004 to 2017 years int_year Calendar time of interview, year
int_month month of interview “January”, “February” , … , “December” NA int_month Calendar time of interview, month
country Country of residence Denmark NA country Country where the interview was carried out, limited to Denmark in our analysis
maxgrip Maximum grip strength 0-100 kg maxgrip " Valid measurements were defined as the values of two measurements in one hand that differed by less than 20 kg. GS measurements with values =0 kg or C100 kg were excluded as well as if GS was only measured once in one hand. The maximum value (MaxGS) was defined as the maximum GS measurement of both hands (2 x 2) or of one hand (2 x 1). " [Andersen-Ranberg, K., Petersen, I., Frederiksen, H., Mackenbach, J. P., & Christensen, K. (2009). Cross-national differences in grip strength among 50+ year-old Europeans: results from the SHARE study. European Journal of Ageing, 6(3), 227-236.]
maxgrip_miss reason for missing values of maxgrip “Refusal”, “Don’t know”, “R agrees to take measurement”, “R refuses to take measurement”. “R is unable to take measurement” , “Proxy-interview” NA gs_001 Gs_001 (refulsal,agrees, unable to make meausrement) and gs_002: use of hands (both, right only, left only, cannot use both hands)
age_int Age 50+ years age_int age at the time of interview is calculated on the basis of the respondent’s month and year of birth (dn002_mod and dn003_mod) and month and year of interview (int_year and int_month). The month-exact age is divided by 12 to produce age in years. They imputed missing values for all the variables except for year of birth
gender Sex 0 = male; 1 =female NA DN042_Gender sex of the participant
pa_vig vigrous physical activity 1= more than once a week; 2= once a week; 3= one to three times a month; 4= hardly ever, or never; Refusal; Don’t know NA br015_ br015_ gives information on the frequency of doing vigorous activities such as sports, heavy housework, or a job that involves physical labor.
pa_low low or moderate physical activity 1= more than once a week; 2= once a week; 3= one to three times a month; 4= hardly ever, or never; Refusal; Don’t know NA br016_ How often do you engage in activities that require a low or moderate level of energy such as gardening, cleaning the car, or doing a walk?
height Height 105-210 cm ph013_ Height (ph013_) is asked only in the baseline interview up to w4 - included. Must be imputed for longitudinal interviews. In w5 and w6 it is asked directly to everybody, in w7 again just in the baseline interview. Negative values are codes for missing values
weight Weight 40-220 kg ph012_ negative values are codes for missing values
bmi Body Mass Index BMI = (ph012_/(ph013)2)10 000. NA bmi BMI = (ph012_/(ph013)2)10 000, the value is derived from SHARE
cusmokenoPP current smoking at baseline “Yes, currently smoke” ,“Never smoked daily for at least one year”, “No, I have stopped”, “Refusal”, “Don’t know” , NA cusmoke Current/ever smoking is available only at baseline interviews from w6 on. Therefore, most of values for w6 (and7) are missing by design
isced1997_r ISCED-97 code 1 , ISCED-97 code 2 …. ISCED-97 code 6, Other, Refusal, Don’t know, None NA NA isced1997_r Highest education level achieved, according to the ISCED 1997 categorization
dead NA 1=yes, 0=no NA XT008_MonthDied, XT009_YearDied Xt.all includes Mergeid, gender, country, year of birth (yrbirth_xt), month and year of death (xt008_ xt009_), age at death (xt010_), main reason of death (xt011_) and wave in which death was recorded with the interview

The variables suffixed with PP (pre-preprocessing) indicate a variable that was further preprocessed when data were extracted, the variables that were derived with further preprocessing are included in the list of the Derived variables below.

knitr::kable(derived.var,format="html", caption="Derived variables") %>% kable_styling()
Derived variables
variable label value Explanation type
age_int_cat Age category 50-59, 60-69, 70-79, 80+ Based on the age variable Factor
cusmoke current smoking Yes, No Variable derived by us combining the responses of br002_ and br001_ variables. It retrieves many values that were set to missing by the cusmoke variable provided by SHARE Factor
pa_vig_freq vigrous physical activity more than once a week; binary 1= more than once a week; 0: once a week; one to three times a month; hardly ever, or never; NA=Refusal; Don’t know Derived from pa_vig Factor
pa_low_freq low or moderate physical activity more than once a week; binary 1= more than once a week; 2= once a week; 3= one to three times a month; 4= hardly ever, or never; Refusal; Don’t know Derived from pa_low Factor
int_date date of interview (assuming it took place on the 15th of the month, as day is not reported) date Based on int_month and int_year date
education Education in three categories Low =“None”| “ISCED-97 code 1” | ISCED-97 code 2" ; Medium = “ISCED-97 code 3” | “ISCED-97 code 4”; High = “ISCED-97 code 5” | “ISCED-97 code 6”; NA = Other | Refusal Based on isced1997_r, grouping in three levels Factor
death_date Date at death date Date of death, assuming it occurred on the 15th of the month date

The variables in the list below include additional information about the maxgrip variable (individual measurements, reasons for missingness, etc).

knitr::kable(maxgrip.var,format="html", caption="Additional information about maxgrip") %>% kable_styling()
Additional information about maxgrip
variable label value unit SHARE.name explanation type
gs006_ GS1, left hand 0-100 kg gs006_ Grip strength, first measurement of left hand Numeric
gs007_ GS2, left hand 0-100 kg gs007_ Grip strength, second measurement of left hand Numeric
gs008_ GS1, right hand 0-100 kg gs008_ Grip strength, first measurement of right hand Numeric
gs009_ GS2, right hand 0-100 kg gs009_ Grip strength, second measurement of rjght hand Numeric
gs001_ GS available Refusal, Don’t know , R agrees to take measurement, R refuses to take measurement. R is unable to take measurement , Proxy-interview NA gs001_ Availabe/Missing GS and Reason for missing value of GS Factor
gs002_ GS use of hands Refusal , Don’t know, Respondent has the use of both hands , Respondent is unable to use right hand, Respondent is unable to use left hand, Respondent is unable to use either hand NA gs002_ use of hands Factor

The codes used in the original data for missing values are reported below.

knitr::kable(missing.values,format="html", caption="Codes used for missing values") %>% kable_styling()
Codes used for missing values
A:.general.missing.codes Code
-1 “Don’t know”
-2 “Refusal”
-3 “Implausible value/suspected wrong”
-4 “Not codable”
-5 “Not answered”
-6 “Not yet coded”
-9 “Not applicable”
B: not applicable – specified Code
-91 “Not applicable (not yet part of sample)”
-92 “Not applicable (no participation in this wave)”
-93 “Not applicable (respondent was not listed as household member in this wave)”
-94 “Not applicable (respondent has an End-of-Life interview in previous waves)”
-95 “Not applicable (no main interview done)”
-98 “Not applicable (other reason)”
-99 “Not applicable (missing by design)”

4.1.5 Output of data import, management and cleaning

The data used for IDA included a file in long format (one row per measurement), two wide-format files (by wave and by measurement occasion), and files that facilitate the retrieval of the participation profile and of the information about deaths.The details are described in the supplementary file ImportDataGS_Denmark_longitudinal_v3.html

4.1.6 Domain expertise

Structural variables are: sex, wave and type of interview (baseline vs. longitudinal), and grouped age. In some explorations baseline wave (baseline wave) rather than wave can be used as structural variable; also measurement occasion can be used as an alternative to wave, as appropriate and most informative for the IDA analyses.

4.2 IDA planned analyses

Topic (Item) Features Purpose
Participation Profile
Time frame (P1) (i) summarize the times when interviews were taken by calendar time, stratifying by Wave; (ii) evaluate the time between interviews carried out in successive waves; (iii) evaluate the number of baseline and longitudinal/SHARELIFE interviews by wave; (iv) summarize the use of baseline and longitudinal questionnaires across waves and measurement occasions (i) explore when the study was carried out and how the measurements were distributed across waves, identify possible problems in how data were collected within waves (gaps, time-lags, bimodal distributions in interview dates, …); (ii) evaluate if the time between successive measurements was similar across waves and the variability of the time differences; (iii) explore the temporal/by wave distribution of the baseline interviews vs longitudinal interviews; (iv) understand the design of the study, identify possible errors in the implementation of the study (incorrect use of questionnaires).
Time metric (P2) The analysis strategy (AS) defines age as the time metric in the model (different than in P1). Describe the distribution of age, (i) overall, stratified by (ii) wave, (iii) by type of interview (baseline/longitudinal) (i) Understand the distribution of age and its consistency with the expectations; (ii) and (iii) understand the characteristics of the refreshment samples and check if the collected data are consistent with the sampling design.
Participants (P3) Provide the number of participants who attended the assessment using measurement occasion as time metric. To evaluate the overall sample size and the amount of longitudinal information available for each participant, which can influence the complexity of the modeling of the trajectories at individual level.
IDA screening domain: Missing data
Non-enrollment (M1) Describe the participants that were selected but did not participate in the study - only proportions are available from meta-data, the characteristics are not reported in the data. Understand the extent and characteristics of the participants that decide not to participate.
Drop-out and intermittent missingness (M2/M3) (i) report the number of participants at baseline and at each successive measurement occasion, stratifying by baseline wave (tables and ribbon plots to visualize the loss of participants) (ii) stratify the number of participants at each measurement occasion by reason of missingness, using the categories: death, out- of-sample, intermittent missingness, missing/definitive missing, administrative sampling, out of household. Out-of-sample is defined as removed from the study because of three missing interviews, as defined in the study protocol, the definition is applied retrospectively at the first missing interview; missing/definitive missing participants have last possible interview missing but were not removed from the study (and can potentially be interviewed in a future wave); (iii) compare the baseline characteristics of participants classified by (later) type of missingness (complete responders, deaths, intermittent missingness, lost to follow-up (out-of-sample or missing), out of household) using descriptive statistics. (iv) evaluate the number/proportion of participants for which the vital status is unknown, and check the consistency of the information included in the data set (i.e., reported status vs date of death, other possible aspects identified during data exploration). (i) To understand the attrition magnitude and mechanism; evaluate the amount of available information for participants; (ii) to evaluate the amount of deaths during follow-up; highlighting how/when missing values occur during follow-up, differentiating them by the cause of missingness; (iii) to identify the baseline characteristics of participants with different type of missingness later during the study; (iv) evaluate the quality of data.
Variable (item) missingness (M4) Restrict the attention to the participants that have a valid measurement (as unit missingness was addressed in M2/M3) and on the variables included as predictors in the AS and the outcome; (i) provide the number and proportion of missing values for the outcome and the explanatory variables (weight, smoking status, vigorous physical activity, moderate physical activity, sex, height, education) at baseline, using the data from the available interviews, overall and stratified by sex, wave and age group. (ii) Describe missingness also during the longitudinal follow-up for the time varying explanatory variables, overall and stratifying by type of interview, wave, measurement occasion, and by sex and age group. For the summaries stratified by measurement occasion, remove the observations where the variables are missing by design. (iii) Describe the missingness of the outcome (when the interview is available) during the longitudinal follow-up, stratifying by sex and age group. (iv) Describe the reasons for missing outcome (when the interview is available) To evaluate the amount of missingness and obtain insights about the missing data mechanism; for the explanatory variables: to evaluate if imputation is needed, to assess from data if some variables are missing by design (for specific types of interviews/waves); for the outcome: to evaluate if the assumption of MAR is appropriate.
Patterns (M5) Describe the co-occurrence of missing values (i) for variables at baseline, (ii) of the outcome across measurement occasions, (iii) of each time-varying explanatory variables across measurement occasions To evaluate how common is the co-occurrence of missing values in some variables, and to evaluate if there are patterns of co-occurrence of missingness (of outcome or time-varying variables) across measurement occasions.
Extensions: Missing data
Non-enrollment (ME1) Compare the baseline distribution of age/sex/education of the participants that entered the study with those from the target population (using data on the target population as available from EUROSTAT), stratifying by wave and type of refreshment sample. To evaluate if the responders differ from the target population.
Probability of drop-out (ME2) Estimate the probability of loss to follow-up, death and death after loss to follow-up; stratify by sex only, and by sex and age group. Details: Define loss to follow-up (LTF) as missing data due to: out-of-sample, missingness, out of household if not re-included in the analysis later; the estimates are obtained estimating cumulative incidence functions using Aalen-Johansen estimators for LTF and deaths (defining death times/events only for those that are not lost to follow-up - as if LTF was an absorbing state), and with Kaplan-Meier estimator to estimate the probability of death after loss to follow-up (time of entry=the time of LTF, time of end=death, time of censoring = the end of the study for those who are not dead). Evaluate the probability of loss to follow-up and death during the study and their association with sex and age; these estimates might provide indications about the mechanism leading to missing values and give indications about the appropriateness of the methods used to take into accout the missing values in modelling.
Dropout effect on outcome (ME3) Visualize the mean profiles of grip strength by measurement occasion, (i) stratified by drop-out measurement occasion due to death and (ii) stratified by drop-out measurement occasion due to loss to follow-up. To evaluate if the trajectories of the outcome variable of participants that died (were lost to follow-up) differ from those that survived (were not lost to follow-up), and thus to evaluate the missing data mechanism.
IDA screening domain: Univariate descriptions
Description of the variables at baseline (U1) Summarize the outcome variable and the explanatory variables with numerical summaries (in a table, reporting mean, median, interquartile range, standard deviation for numerical variables, number and proportion for each category for categorical variables) and graphical summaries (using combo plots consisting of high-resolution histograms, boxplots and dotplots for numerical variables). For physical activity variables, which were recorded with four levels and dichotomized by us, provide also the summaries using the original categories. To summarize the distributions of the variables, to compare the summaries with the expectations and identify possible problems in the data (digit preference, values that are present too often, unexpected/unadmissible values, …).
Description of the time-varying variables at later points (U2) Summarize the outcome variable and the time-varying explanatory variables at later time points, using wave as time metric. To summarize the distribution of variables during follow-up. The use of wave as time metric is helpful to identify systematic differences across waves, possibly due to data collection issues.
IDA screening domain: Multivariate descriptions
Association at baseline (V1) Visualize the association of each explanatory variable with (the structural variables of) age at baseline and sex; age can be grouped (10-year intervals) or used as a numerical variable; To evaluate if the expectation about the association between explanatory variables and age and sex are met, obtaining an indication of the strength of the association.
Correlation at baseline (V2) (i) Evaluate the Pearson correlation between all explanatory variables at baseline, present data with numbers and using heatmaps, stratify by sex. To evaluate the strength of correlation between explanatory variables, which might influence the choice of the explanatory variables.
Interactions at baseline, if applicable (V3) (i) Mostly not needed as explored in VE1 and V1; here we (ii) explore the association between age and weight, stratified by physical activity (i) The analysis plan envisions the use of interactions between age and all time fixed explanatory variables (sex, education, height), the main interest will be in the interpretation of the interaction between sex and functions of age. The descriptive statistics stratified of all the explanatory variables stratified by age groups and sex are reported in section VE1 - Stratification, the association between height and age/sex is explored in V1; (ii) to provide information to domain experts about the possible interaction between age and status of vigorous/low intensity activity with respect to weight.
Extensions: Multivariate descriptions
Stratification (VE1) Stratify the univariate descriptions of the data by (i) sex and age group (stratification by wave is presented in the longitudinal aspects) and (ii) by baseline wave (i) To explore the distribution of variables in subgroups of participants and (ii) to evaluate the possible differences between participants included in different waves.
IDA screening domain: Longitudinal aspects
Profiles (L1) (i) Visualize the individual profiles of the outcome for all the participants, using age and measurement occasion as time metrics, and stratifying by sex; (ii) to take into account that the number of participants is large, use selected subgroups of participants (100 per group, stratifying the plots by sex and age groups); (iii) provide also interactive plots, where the individual profiles can be selected. Use colors that reflect the value of the initial grip strength. To evaluate the shape of the profiles and the individual variability; these explorations can guide the choice of the functional forms of age (which is the time metric).
Trends (L2) Describe with tables and graphically the longitudinal trends of the outcome variable, stratifying by sex (i) using wave, (ii) measurement occasion (ii) and age as time metrics. Histograms and boxplots can be used, the numerical summarization is performed as for univariate summaries. To understand the average time changes in the outcome, to evaluate the appropriateness of the functional form of time metric.
Correlation and variability (L3) Using different time metrics (wave, measurement occasion, age), estimate (i) the Pearson’s correlation of the outcome variable within each participants across time points, using complete pairs, and (ii) the variability of the outcome variable across time points; overall and stratified by sex. To get insights on (i) the correlation structure and (ii) the variance structure that will be used in modeling. Using waves we can identify some systematic errors due to wave, while measurement occasion/age is more directly related to the research question (decline of grip strength in time/with age).
Trends of time-varying explanatory variables (L4) Describe numerically or graphically the longitudinal trends of the time-varying variables, using (i) measurement occasion and (ii) age as time metrics, stratifying the analyses by sex. Report the overall characteristics using descriptive statistics at each time point; for physical activities variables focus on individual changes, using Sankey plots. ADD >>>
Extensions: Longitudinal aspects
Cohort/Period effects (LE1) (i) Define birth cohort (grouped in ten year intervals, larger intervals can be used in case of sparse categories) and evaluate its association with age and wave; (ii) summarize possible cohorts on the outcome, and on the explanatory variables (iii) summarize possible cohort effect on exploratory variables. To assess if the variation of the outcome / exploratory variables can occur because of birth cohort effects and to guide the choice to include birth cohort as an explanatory variable in the regression model.

5 Data screening

In this part of the file the comments in blue indicate IDA findings.

# here we upload data from Denmark only
load(here::here("Data", "GSSHAREDenmarkNoExcl_v2.Rdata"))

# load data about weights
load(here::here("Data", "GSSHAREDenmarkWeights.Rdata"))

share1 <- share
sharew1 <- sharew
sharew1.baseline <- share.data.first.interview


# recoding of the typeQuest variable
share1$typeQuest <-
  recode(
    share1$typeQuest,
    "Baseline questionnaire" = 'Baseline questionnaire',
    "Longitudinal questionnaire"  = 'Longitudinal questionnaire',
    Sharelife  = 'SHARELIFE'
  )


#baseline date, added to the long format dataset
share1$int_date_baseline <-
  left_join(share1, sharew1.baseline[, c("mergeid", "int_date")], by = "mergeid")$int_date.y

#age at baseline
share1$age_int_baseline <-
  left_join(share1, sharew1.baseline[, c("mergeid", "age_int")], by = "mergeid")$age_int.y

#time between baseline and current interview, in years
share1$time_since_baseline <-
  (as.numeric(share1$int_date - share1$int_date_baseline) / 365.25)


share1 <-
  mutate(share1, first.wave.cat = factor(paste("Wave", first.wave)))
sharew1 <-
  mutate(sharew1, first.wave.cat = factor(paste("Wave", first.wave)))


# define data frames that contain outcome data and other variables by measurement occasion and by wave
mg.occasion <- select(sharew.mo, starts_with("maxgrip"))
mg.wave <- select(sharew1, starts_with("maxgrip"))

age.occasion <- select(sharew.mo, starts_with("age_"))
weight.occasion <- select(sharew.mo, starts_with("weight"))

num.waves <- 7

baseline.wave.col <- sharew1$first.wave

#preparation of the datasets where the suspicious values are flagged
sharew1_withflags <- sharew1
share1_withflags <- share1

5.1 Participation profile

5.1.1 Time frame of the study (P1)

Here we summarize the times when interviews were taken (by calendar time or Wave).

5.1.1.1 Distribution of the dates of the interviews (by wave)

The graph below shows the distribution of the dates where the interviews were carried out, stratified by Wave.

g <-
  ggplot(share1, aes(int_date, fill = Wave)) + geom_histogram(binwidth = 10) + theme_bw() +
  labs(x = "Interview date", y = "Number of interviews") + scale_x_date(date_breaks = "2 years",  date_labels = "%Y") +   guides(fill = guide_legend(nrow = 1)) +
  theme(legend.position = 'bottom', legend.direction = "horizontal")

g

# the size of axes/labels is increased for the plots that are included in the paper

g <-
  g + theme(
    text = element_text(size = rel(5)),
    strip.text.x = element_text(size = rel(5)),
    strip.text.y = element_text(size = rel(5)),
    legend.text = element_text (size = 15)
  )

ggsave(
  "Figures/waveTimeHistAll.png",
  scale = 1.5,
  height = 4,
  width = 10,
  plot = g
)


### Figure 1 for paper #####
ggsave(
  "FiguresSubmission/Fig1.tif",
  scale = 1.5,
  height = 4,
  width = 10,
  plot = g,
  compression = "lzw",
  device = "tiff",
  dpi = 600
)
##############################


ggsave(
  "Figures/waveTimeHistAllRescaled.png",
  scale = 1.5,
  height = 4,
  width = 4,
  plot = g +
    theme(
      legend.position = 'bottom',
      legend.direction = "horizontal",
      # added to increase the size of the axes/labels
      text = element_text(size = rel(4)),
      strip.text.x = element_text(size = rel(4)),
      strip.text.y = element_text(size = rel(4)),
      legend.text = element_text (size = 10)
    )
)

5.1.1.2 Time range for each Wave (baseline and longitudinal interviews)

The wave with most interviews was Wave 5. The distribution of the number of interviews per Wave is shown below, with the range of dates for the interviews performed in different waves.

#variables defined in a chunk at the beginning of the file
#share1$int_date <- lubridate::make_date(year=share1$int_year, month = share1$int_month, day=15)
#mg <-  sharew1[ ,c("maxgrip", "maxgrip.2", "maxgrip.4", "maxgrip.5", "maxgrip.6")]
#wavepart <- apply(mg, 1, function(x) paste0(c(1,2,4,5,6)[!is.na(x)], collapse=""))



tmp.df  <-
  share1 %>% group_by(Wave) %>% dplyr::summarise(
    num.noNA = sum(!is.na(int_date)),
    me = median(int_date, na.rm = TRUE),
    min = min(int_date, na.rm = TRUE),
    max = max(int_date, na.rm = TRUE)
  )

tmp.df$Proportion <- tmp.df$num.noNA / sum(tmp.df$num.noNA)

tmp.df <- tmp.df[, c(1, 2, 6, 4, 5)]
dimnames(tmp.df)[[2]] <-
  c("Wave" ,
    "Number of interviews",
    "Proportion",
    "Begin (date)",
    "End (date)")



knitr::kable(tmp.df, digits = 2) %>% kable_styling()
Wave Number of interviews Proportion Begin (date) End (date)
Wave 1 1596 0.09 2004-04-15 2004-11-15
Wave 2 2487 0.13 2006-11-15 2007-08-15
Wave 3 1979 0.11 2008-11-15 2009-08-15
Wave 4 2112 0.11 2011-02-15 2011-08-15
Wave 5 3919 0.21 2013-02-15 2013-11-15
Wave 6 3514 0.19 2015-02-15 2015-11-15
Wave 7 3025 0.16 2017-03-15 2017-10-15

We summarized the time between interviews conducted in specific Waves also at individual level (graphically and with summary statistics). NA’s indicate individuals where at least one of the two interviews was missing. Data were summarized in years, (months were used for SD).

The time lag between waves was approximately 2 years, with a slightly longer gap between Wave 1 and 2, and Wave 3 and 4. The shorter time lag was between Wave 2 and 3, and Wave 5 and 6. The variability was highest between wave 5 and 6.

tmp.year <- select(sharew1, starts_with("int_year"))
tmp.month <- select(sharew1, starts_with("int_month"))

int_date.w <- matrix(NA, ncol=num.waves, nrow=nrow(sharew1))

for(i in 1:num.waves){
  int_date.w[,i] <- lubridate::make_date(year=tmp.year[,i], month = tmp.month[,i], day=15)
}


int_date.mo <- f.waveToMO(int_date.w, num.waves=7, num.obs.w1=nrow(sharew1.baseline), my.baseline.wave.col=sharew1$first.wave)


boxplot(t(apply(int_date.w, 1, function(x) diff(x)))/365.25, xlab = "", ylab="Time difference between \ninterviews in successive waves (years)", names=paste("W", 2:7, "- W", 1:6), xaxt = "n")
axis(1, las =2, labels= FALSE )

#to rotate the x-axis labels by 35 degrees
text(x = 1:6,
     ## Move labels to just below bottom of chart.
     y = par("usr")[3] - 0.15,
     ## Use names from the data list.
     labels = paste("W", 2:7, "- W", 1:6),
     ## Change the clipping region.
     xpd = NA,
     ## Rotate the labels by 35 degrees.
     srt = 35,
     ## Adjust the labels to almost 100% right-justified.
     adj = 0.965,
     ## Increase label size.
     cex = 1.1)

abline(h=seq(1, 3, by=.5), lty=2, col="gray")

# checkRemove - rounding in the output, bug in R of the summary function
# kable(summary(t(apply(int_date.w, 1, function(x) diff(x)))/365.25), col.names=paste("Wave", 2:7, "- Wave", 1:6), digits=2)  %>% kable_styling()
 
# the summary is assembled manually to avoid the problem with the number of displayed digits, seems a bug in R
tmp <- apply(t(apply(int_date.w, 1, function(x) diff(x)))/365.25, 2, function(x) c(summary(x[!is.na(x)]),  sum(is.na(x))))
dimnames(tmp)[[1]][7] <- "NA's"
kable(tmp, digits=2)  %>% kable_styling()
Min. 2.17 1.25 1.75 1.67 1.25 1.33
1st Qu. 2.42 1.67 2.16 1.92 1.66 1.92
Median 2.50 1.83 2.25 2.08 1.84 2.00
Mean 2.53 1.83 2.22 2.07 1.84 2.01
3rd Qu. 2.66 2.00 2.33 2.17 2.00 2.16
Max. 3.25 2.50 2.66 2.67 2.67 2.66
NA’s 4267.00 3477.00 3864.00 3632.00 2266.00 2563.00

Standard deviations of the time differences, in months.

kable(t(apply(t(apply(int_date.w, 1, function(x) diff(x)))/365.25, 2, sd, na.rm=TRUE)*12), 
      col.names=paste("SD Wave", 2:7, "- Wave", 1:6),
      digits=1, caption = "Standard deviation of time difference between measurements, in months") %>% kable_styling()
Standard deviation of time difference between measurements, in months
SD Wave 2 - Wave 1 SD Wave 3 - Wave 2 SD Wave 4 - Wave 3 SD Wave 5 - Wave 4 SD Wave 6 - Wave 5 SD Wave 7 - Wave 6
2 2.2 1.7 1.9 3.3 2.4

5.1.1.3 Distribution of first (baseline) interviews and longitudinal interviews by wave/calendar time

The participants were followed up longitudinally, and refreshment samples (new participants) were drawn during the study, as planned. The table below shows that no new participants were included in Wave 3 and 7 (SHARELIFE interviews), and that the largest refreshment samples were included in Wave 2 and 5 (planned full range refreshment samples, while Wave 4 and 6 planned the refreshment sample of the youngest cohort only).

 my.tab <- table(share1$Wave,  as.character(share1$Wave)!=as.character(share1$firstwave))
 dimnames(my.tab)[[2]] <- c("Baseline", "Longitudinal/SHARELIFE")
kable(my.tab, caption = "Number of interviews by wave, baseline or longitudinal follow-up")  %>% kable_styling()
Number of interviews by wave, baseline or longitudinal follow-up
Baseline Longitudinal/SHARELIFE
Wave 1 1596 0
Wave 2 1266 1221
Wave 3 0 1979
Wave 4 408 1704
Wave 5 1872 2047
Wave 6 228 3286
Wave 7 0 3025

Results are presented also graphically using calendar time as time metric, where it can be seen that in the waves where both types of interview were carried out, data from longitudinal interviews were generally collected earlier than those from baseline interviews.

g <- ggplot(share1 %>% mutate(type = factor(ifelse(typeQuest =="Baseline questionnaire", "Baseline questionnaire", "Longitudinal/SHARELIFE")))  %>% filter(!is.na(type)), aes(int_date, fill=Wave))+geom_histogram(binwidth = 10)+ theme_bw()+labs(x="Interview date", y="Number of interviews")+ scale_x_date(date_breaks = "2 years",  date_labels = "%Y") +   guides(fill = guide_legend(nrow = 1)) + 
  theme(legend.position = 'bottom', legend.direction = "horizontal") + facet_grid(type~.)

         # added to increase the size of the axes/labels
         #text = element_text(size=rel(5)), strip.text.x = element_text(size=rel(5)), 
         #strip.text.y = element_text(size=rel(5)), legend.text = element_text (size=15)) 
g

More details about the refreshment samples are given in the following sections.

5.1.1.3.1 Additional explorations on the type of questionnaries used

By design, the baseline questionnaire is used for the first interview, the longitudinal questionnaire for the follow-up interviews. Here we check if the design was followed.

The baseline questionnaire was used for most of the first interviews, and the longitudinal for follow-up interviews, but some exceptions are observed. (Here we use the variable typeQuest that was defined in the data import step.)

# (the value of the variable `mn101_` is set to missing for all interviews in wave 3 and for some interviews in later waves)

#defined in import files 
#share1$typeQuest <- as.character(share1$mn101_)
#share1$typeQuest[which(share1$Wave=="Wave 3" | share1$mn103_=="Yes")] <- "Sharelife" -> transformed to SHARELIFE


kable(addmargins(table(share1$Wave,share1$typeQuest,  exclude=NULL)), caption = "Number of interviews per type of questionnaire and Wave") %>% kable_styling()
Number of interviews per type of questionnaire and Wave
Baseline questionnaire Longitudinal questionnaire SHARELIFE NA Sum
Wave 1 1596 0 0 0 1596
Wave 2 1313 1174 0 0 2487
Wave 3 0 0 1979 0 1979
Wave 4 416 1696 0 0 2112
Wave 5 1892 2027 0 0 3919
Wave 6 265 3247 0 2 3514
Wave 7 1 1189 1835 0 3025
Sum 5483 9333 3814 2 18632
kable(table(share1$typeQuest, share1$measurement_occasion, exclude=NULL), caption = "Number of interviews per type of questionniare and measurement occasion", col.names = paste0("M", 1:7)) %>% kable_styling()
Number of interviews per type of questionniare and measurement occasion
M1 M2 M3 M4 M5 M6 M7
Baseline questionnaire 5450 17 3 4 5 4 0
Longitudinal questionnaire 2 2990 1128 1714 1575 1358 566
SHARELIFE 0 1203 2232 281 0 32 66
NA 0 1 0 0 1 0 0

Overall, the deviations from design in the use of questionnaries were small. The baseline questionnaire was used more than once for some participants (n=33), a longitudinal questionnarie was used at first measurement for 2 participants, the questionnaire type was unknown for 2 participants. In Wave 7 baseline questionnaire was used for 1 participant (by design it should not have been used).

5.1.2 Time metric (P2)

The analysis strategy defines age as the time metric in the model, which is described here.

5.1.2.1 Distribution of age

The inclusion criteria specified that age at baseline interview was at least 50. The sampling design is briefly described in the description of the data.

The distribution of the age of the participants, stratified by Wave (overall, and by baseline or longitudinal interview) is presented graphically.

The overall distribution of age across waves differed somehow, as did the distribution in the baseline and longitudinal interviews, due to the sampling design. The small group of participants first included in Wave 4 and 6 were, by design, considerably younger than those included in other waves. In Wave 3 and 7, where no refreshment sample was used, it was expected that the distribution of age would be shifted and reflect the 52+ population, rather than the 50+. Overall, the distribution of age across waves and types of interviews is consistent with the expectations based on the sampling design.

# summaries are derived dividing interviews in first or longitudinal/SHARELIFE 
# note that we do not use the type of questionnaires but the order of the interview / small differences were observed

p <- ggplot(
  rbind.data.frame(share1, share1) %>%
    mutate(
      type = ifelse(
        first.wave==Wave_num,
        "First interview",
        "Longitudinal/SHARELIFE"
      )
    ) %>% mutate(type = ifelse(c(1:(
      2 * nrow(share1)
    )) <= nrow(share1), "All", type)) %>% filter(!is.na(type))
  
  ,
  aes(age_int)
) + geom_histogram(binwidth = 1,
                   fill = "white",
                   color = "black") +
  facet_grid(Wave ~ type) +
  labs(x = "Age at interview", y = "Frequency") +
  theme_bw()


df <- rbind.data.frame(share1, share1) %>% mutate(
  type = ifelse(
        first.wave==Wave_num,
        "First interview",
        "Longitudinal/SHARELIFE"
      )
  ) %>% mutate(type = ifelse(c(1:(2 * nrow(
    share1
  ))) <= nrow(share1), "All", type)) %>% filter(!is.na(type))

dat_text <- data.frame(
  label = paste("n=", c(table(df$Wave, df$type))),
  Wave   = rep(
    c("Wave 1", "Wave 2", "Wave 3", "Wave 4", "Wave 5",
      "Wave 6", "Wave 7"),
    3
  ),
  type = rep(
    c("All", "First interview",
      "Longitudinal/SHARELIFE"),
    each = 7
  )
)

p + geom_text(
  data    = dat_text,
  mapping = aes(x = Inf, y = Inf, label = label),
  hjust   = 1.2,
  vjust   = 1.2,
  size = 3
)

ggsave(
  "Figures/AgeTimeMetricWaveV3.png",
  scale = 1.5,
  height = 3,
  width = 5
)

#### Figure 3 for paper

ggsave(
  "FiguresSubmission/Fig3.tif",
  scale = 1.5,
  height = 3,
  width = 5,
  device = "tiff",
  compression = "lzw",
  dpi = 600
)

#####################

The distribution of ages by wave, stratified by baseline wave is shown in the figure below, which presents graphically the aging of the wave cohorts.

ggplot(share1 %>% filter(!is.na(typeQuest)) %>% mutate(type = ifelse(typeQuest =="Baseline questionnaire", "Baseline questionnaire", "Longitudinal/SHARELIFE")), aes(age_int)) + geom_histogram(binwidth = 1, fill= "white", color="black") + facet_grid(paste("Baseline \nWave",first.wave)~Wave) + labs(x="Age at interview", y = "Frequency") + theme_bw()

The tables below present the summary statistics for age of the observed participants, overall and by sex.

Average age somehow increased at later waves for both sexes , a similar increase in the average age is observed also in the population (data not shown).

my.mat <- matrix(unlist(tapply(share1$age_int, share1$Wave, summary)), ncol=6, byrow=TRUE)
dimnames(my.mat)[[1]] <- paste("Wave", 1:7)
dimnames(my.mat)[[2]] <-c("Min.", "1st Qu." ,  "Median", "Mean", "3rd Qu.", "Max.")


kable(my.mat, digits = 1, caption = "Distribution of age at interview across waves, overall") %>% kable_styling()
Distribution of age at interview across waves, overall
Min. 1st Qu. Median Mean 3rd Qu. Max.
Wave 1 50 56 62 64.4 72 97
Wave 2 50 56 63 64.5 72 99
Wave 3 51 58 64 65.8 73 97
Wave 4 50 57 64 65.1 72 99
Wave 5 50 57 64 65.4 72 100
Wave 6 50 58 65 65.8 72 100
Wave 7 52 60 66 67.2 73 101
my.mat.f <-  matrix(unlist(tapply(share1$age_int[share1$gender=="Female"], share1$Wave[share1$gender=="Female"], summary)), ncol=6, byrow=TRUE)
my.mat.m <-  matrix(unlist(tapply(share1$age_int[share1$gender!="Female"], share1$Wave[share1$gender!="Female"], summary)), ncol=6, byrow=TRUE)
dimnames(my.mat.f)[[1]] <-  dimnames(my.mat.m)[[1]] <-paste("Wave", 1:7)
dimnames(my.mat.f)[[2]] <- dimnames(my.mat.m)[[2]] <- c("Min.", "1st Qu." ,  "Median", "Mean", "3rd Qu.", "Max.")


kable(my.mat.f, digits = 1, caption = "Females: distribution of age at interview across waves") %>% kable_styling()
Females: distribution of age at interview across waves
Min. 1st Qu. Median Mean 3rd Qu. Max.
Wave 1 50 56 63 65.3 74 97
Wave 2 50 56 63 65.1 73 99
Wave 3 51 58 64 66.3 74 97
Wave 4 50 57 64 65.6 73 99
Wave 5 50 57 64 65.5 72 100
Wave 6 50 58 65 65.9 72 98
Wave 7 52 60 66 67.5 74 101
kable(my.mat.m, digits = 1, caption = "Males: distribution of age at interview across waves") %>% kable_styling()
Males: distribution of age at interview across waves
Min. 1st Qu. Median Mean 3rd Qu. Max.
Wave 1 50 55 61 63.4 70.8 94
Wave 2 50 56 62 63.9 70.0 92
Wave 3 51 58 64 65.2 71.0 94
Wave 4 50 57 63 64.5 71.0 96
Wave 5 50 57 64 65.3 72.0 98
Wave 6 50 58 65 65.6 72.0 100
Wave 7 52 60 66 66.9 73.0 98

5.1.3 Participants (P3)

5.1.3.1 Number of participants

Overall, 5452 unique participants were included in the data set, the number of measurements (interviews) was 18632. Denmark participated in all waves of the study.

5.1.3.2 Number of interviews for each participant

Most participants were interviewed 3 times (28%), the number of participants interviewed 1 or 2 times was very similar (16/17 %), the number of interviews ranged from 1 to 7, only 23% of subjects were interviewed 6 or 7 times; the distribution of the number of interviews is given in the table below and shown graphically.

tmp <- cbind.data.frame(table(table(share1$mergeid)), prop.table(table(table(share1$mergeid))))[,-3]
dimnames(tmp)[[2]] <- c("Number of interviews", "Frequency", "Proportion")
knitr::kable(tmp, caption="Number of interviews per participant", digits=2) %>% kable_styling(full_width = FALSE)
Number of interviews per participant
Number of interviews Frequency Proportion
1 965 0.18
2 966 0.18
3 1508 0.28
4 527 0.10
5 307 0.06
6 685 0.13
7 494 0.09
barplot(prop.table(table(table(share1$mergeid))) , ylab="Proportion", xlab="Number of interviews per subject")

knitr::kable(tmp, caption="Number of interviews per participant", digits=2, format = "latex")

5.2 Missing values

5.2.1 Non-enrollment (M1)

Here the aim is to describe the non-enrolled, participants that were selected but did not participate in the study, and the reasons, if available. The SHARE study documentation does not provide a detailed description of the selected sample.

The documentation published by the study reports that response rates were 63% in Wave 1/2, 80% in Wave 3, 50% in Wave 4, 60% in Wave 5, 47% in Wave 6 and 85% in Wave 7. It was reported that in Wave 1 the response rates were very similar for both sexes and across age groups.

We indirectly compare the responders to their target population in ME1, using publicly available data.

5.2.2 Drop-out (M2) and intermittent missingness (M3)

Here we describe the number and characteristics of participants who dropped out from the study during the follow-up (loss to follow-up and other possible reasons: death, withdrawal, missing by design, if applicable). We also describe participants with intermittent missingness (participants that have missing data for some of the measurements occasions, but do not drop out of the study). The summaries are based on the participants that had at least one valid interview (unit missingness other than due to non-enrollment).

5.2.2.1 Summaries of missing interviews based on wave as time metric

The follow-up of the subjects (number of interviews by Wave and proportion), stratified by baseline Wave, is shown in the table below and graphically.

The most dramatic descrease in number or participants is observed in the second wave after inclusion. Only 40% of the participants included in Wave 1 had a valid interview in Wave 7, 50% for those included in Wave 2.

# number of interviews
tab1 <-
  with(share1, table(baseline = first.wave.cat, measurement = Wave))

# proportion of interviews
tab2 <-
  round(sweep(with(
    share1, table(baseline = first.wave.cat, measurement = Wave)
  ), 1, table(sharew1.baseline$Wave), "/"), 2)

#setting to missing all the elements below the diagonal
delta <- row(tab1) - col(tab1)
tab1[delta > 0] <- NA
tab2[delta > 0] <- NA
#additional manual setting
tab1[3, 3] <-
  tab2[3, 3] <-
  tab1[4, 4] <- tab2[4, 4] <- tab1[5, 5] <- tab2[5, 5] <- NA


# arranging in a single table (n, proportion), with proper formatting of the rows (for printing 0 digits,2 digits)

num.elements <- nrow(tab1)
df <- rbind(tab1, tab2)
# rearranging the rows (n, proportion)
df <-
  df[rep(c(1, num.elements + 1), num.elements) + rep(seq(0, num.elements -
                                                           1), each = 2),]
# formatting
df <- digitsByRows(data.frame(df), rep(c(0, 2), num.elements))

# using the same names as the table
dimnames(df)[[1]] <-
  paste(rep(dimnames(tab1)[[1]], each = 2), rep(c("(n)", "(prop)"), num.elements))
dimnames(df)[[2]] <- dimnames(tab1)[[2]]


opts <- options(knitr.kable.NA = "")

df[df == "NA" | df == " NA"] <- NA


kable(df,
      caption = "Number (n) and proportion (prop) of interviews by baseline wave") %>% kable_styling()
Number (n) and proportion (prop) of interviews by baseline wave
Wave 1 Wave 2 Wave 3 Wave 4 Wave 5 Wave 6 Wave 7
Wave 1 (n) 1596 1185 984 875 842 738 632
Wave 1 (prop) 1.00 0.74 0.62 0.55 0.53 0.46 0.40
Wave 2 (n) 1302 995 823 843 739 656
Wave 2 (prop) 1.00 0.76 0.63 0.65 0.57 0.50
Wave 4 (n) 414 351 308 281
Wave 4 (prop) 1.00 0.85 0.74 0.68
Wave 5 (n) 1883 1472 1248
Wave 5 (prop) 1.00 0.78 0.66
Wave 6 (n) 257 208
Wave 6 (prop) 1.00 0.81
#remove all 0, not useful

table.country <-
  with(share1, (table(
    baseline = first.wave.cat, measurement = Wave
  )))

table.country0 <-
  with(share1, (
    table(
      baseline = first.wave.cat,
      measurement = Wave,
      country = country
    )
  ))



table.country2 <- apply(table.country, 2, cumsum)
table.country2[table.country == 0] <- 0


table.country3 <- table.country2
for (i in 2:nrow(table.country3))
  table.country3[i, ] <- table.country2[i - 1, ]
table.country3[1, ] <- 0

table.country3[table.country == 0] <- 0



table.forplot <-
  melt(table.country,
       id.vars = 1,
       measure.vars = c(2:(num.waves + 1)))

table.forplot2 <-
  melt(table.country2,
       id.vars = 1,
       measure.vars = c(2:(num.waves + 1)))

table.forplot3 <-
  melt(table.country3,
       id.vars = 1,
       measure.vars = c(2:(num.waves + 1)))

table.forplot0 <-
  melt(table.country0,
       id.vars = 1,
       measure.vars = c(2:(num.waves + 1)))


tables.forplot <-
  cbind.data.frame(table.forplot, ymax = table.forplot2$value, ymin = table.forplot3$value)

tables.forplot <- subset(tables.forplot, value > 0)


tables.forplot0 <-
  cbind.data.frame(table.forplot0, ymax = table.forplot2$value, ymin = table.forplot3$value)

tables.forplot0 <- subset(tables.forplot0, value > 0)



#ribbon plot
g <-
  ggplot(tables.forplot,
         aes(measurement, ymax, fill = baseline, color = baseline)) + geom_ribbon(aes(
           ymin = ymin,
           ymax = ymax,
           x = measurement,
           group = baseline
         ),
         alpha = 0.2) +
  geom_point(aes(color = baseline)) + labs(x = "Wave",
                                           y = "Number of participants",
                                           title = "Denmark",
                                           fill = "Wave at baseline interview") +  theme_bw() + guides(color = FALSE) + theme(legend.position = "bottom")

g

# adapt the size of the fonts for the figure in the paper
g1 <-
  g + theme(
    text = element_text(size = rel(4)),
    strip.text.x = element_text(size = rel(4)),
    strip.text.y = element_text(size = rel(4)),
    legend.text = element_text (size = 10)
  )

ggsave("Figures/ribbonAll.png", scale = 1.2, plot = g1)

#### Figure 2 for paper ####

ggsave(
  "FiguresSubmission/Fig2.tif",
  scale = 1.2,
  device = "tiff",
  compression = "lzw",
  dpi = 600
)

####

5.2.2.2 Summary of the results about reasons for missing values drop-out

Here we describe the reason for missing values at interview level, summarizing the data by measurement occasion.

Below we show the distribution of the number of available interviews per measurement occasion, categorizing the potential measurements for each participant in each Wave in 7 categories;

  • Interview: the measurement was available.
  • Administrative censoring/No opportunity to measure: the measurement is not taken because the study ended (for example, participants included in wave 6 have only two possible measurement occasions)
  • Death: death was reported in the exit questionnaire (in the graph the dead participants are indicated as dead also in measurement occasions that go beyond the administrative censoring).
  • Out-of-household: not part of the household at the time of interview.
  • Out-of-sample: excluded from the study because of prolonged missingness (participants with non-response in many successive waves are labeled as out-of-sample); a participant is defined as out-of-sample at the first missing interview of the sequence that determines the exclusion from the study - the definitions is applied retrospectively.
  • Missing: unit was missing in the measurement occasion, had no valid interview in later waves, but was not classified as out-of-sample in the study.
  • Intermittent missing: participant was not interviewed in the measurement occasion but an interview at a later wave was obtained.

The vast majority of the subjects were potentially included in the study for at least three measurement occasions. For more than 40% of the subjects the study ended at the forth measurement occasion (many subjects were included in Wave 4 or 5 and therefore cannot have more than 3 valid measurement).

Some participants had intermittent missingness (less than 5% at each measurement occasion), missingness by design because participants were not eligible was very rare (out-of-household, <1%), while administrative censoring and deaths were common, as was the loss to follow-up due to other reasons.

#define a new dataset, where the participants are categorized as out-of-sample when they are first lost to follow-up,
missing.occasion.6_cv_mut <- missing.occasion.6_cv
which.oos <-
  which(apply(missing.occasion.6_cv_mut, 1, function(x)
    any(x == -1000)))

# transform code -12 to -1000 at first occurrence of -12
missing.occasion.6_cv_mut[which.oos, ][missing.occasion.6_cv_mut[which.oos, ] ==
                                         -12] <- -1000
# carry over out of the sample beyond administrative censoring
missing.occasion.6_cv_mut[which.oos, ][is.na(missing.occasion.6_cv_mut[which.oos, ])] <-
  -1000



# carry over deaths beyond administrative censoring
which.death <-
  which(apply(missing.occasion.6_cv_mut, 1, function(x)
    any(x == -100)))
missing.occasion.6_cv_mut[which.death, ][is.na(missing.occasion.6_cv_mut[which.death, ])] <-
  -100

table_patterns.1 <- rbind.data.frame(
  table(
    factor(
      missing.occasion.6_cv_mut[, 1],
      levels = c(-1001,-1000,-100,-12,-11,  1)
    ),
    exclude = NULL,
    useNA = "always"
  ),
  table(
    factor(
      missing.occasion.6_cv_mut[, 2],
      levels = c(-1001,-1000,-100,-12,-11,  1)
    ),
    exclude = NULL,
    useNA = "always"
  ),
  table(
    factor(
      missing.occasion.6_cv_mut[, 3],
      levels = c(-1001,-1000,-100,-12,-11,  1)
    ),
    exclude = NULL,
    useNA = "always"
  ),
  table(
    factor(
      missing.occasion.6_cv_mut[, 4],
      levels = c(-1001,-1000,-100,-12,-11,  1)
    ),
    exclude = NULL,
    useNA = "always"
  ),
  table(
    factor(
      missing.occasion.6_cv_mut[, 5],
      levels = c(-1001,-1000,-100,-12,-11,  1)
    ),
    exclude = NULL,
    useNA = "always"
  ),
  table(
    factor(
      missing.occasion.6_cv_mut[, 6],
      levels = c(-1001,-1000,-100,-12,-11,  1)
    ),
    exclude = NULL,
    useNA = "always"
  ),
  table(
    factor(
      missing.occasion.6_cv_mut[, 7],
      levels = c(-1001,-1000,-100,-12,-11,  1)
    ),
    exclude = NULL,
    useNA = "always"
  )
)[, c(6:1, 7)]

#dimnames(table_patterns)[[2]] <- c("Interview", "Intermittent NA", "Lost to FU", "Death", "NA")
dimnames(table_patterns.1)[[2]] <-
  c(
    "Interview",
    "Intermittent missing",
    "Missing",
    "Death",
    "Out-of-sample",
    "Out-of-household",
    "Administrative censoring"
  )
#dimnames(table_patterns)[[1]] <- paste0("M",1:7)



table_patterns.1$Measurement <- paste0("M", 1:7)
#table_patterns.1 <- table_patterns.1[,c(7,1:6)]
table_patterns.1 <- table_patterns.1[, c(8, 1:7)]

DF1 <- reshape2::melt(table_patterns.1, id.var = "Measurement")


g <-
  ggplot(DF1 %>% mutate(variable = factor(
    variable,
    levels = c(
      "Administrative censoring",
      "Death" ,
      "Out-of-household",
      "Out-of-sample"  ,
      "Missing" ,
      "Intermittent missing",
      "Interview"
    )
  )),
  aes(x = Measurement, y = value, fill = variable)) +
  geom_bar(stat = "identity") + labs(y = "Number of interviews", x =
                                       "Measurement occasion", fill = " ") +  theme_classic() +
  theme(
    legend.position = 'bottom',
    legend.direction = "horizontal",
    # added to increase the size of the axes/labels
    text = element_text(size = rel(4)),
    strip.text.x = element_text(size = rel(4)),
    strip.text.y = element_text(size = rel(4)),
    legend.text = element_text (size = 10)
  ) +
  scale_fill_manual(values = c("gray", "red", "black", "orange", "yellow", "light blue", "blue"))  + guides(fill = guide_legend(reverse =
                                                                                                                                  TRUE))

g <-
  g + theme(
    legend.position = 'bottom',
    legend.direction = "horizontal",
    # added to increase the size of the axes/labels
    text = element_text(size = rel(5)),
    strip.text.x = element_text(size = rel(5)),
    strip.text.y = element_text(size = rel(5)),
    legend.text = element_text (size = 15)
  )

ggsave(
  "Figures/MissingPatternsAug22.png",
  scale = 1.3,
  width = 10,
  height = 5
)


#### figure 5 for paper
ggsave(
  "FiguresSubmission/Fig5.tif",
  scale = 1.3,
  width = 10,
  height = 5,
  device = "tiff",
  compression = "lzw",
  dpi = 600
)
#####


### preparation of the table ############


tab1 <- table_patterns.1[, -1]
tab2 <- cbind(round(table_patterns.1[, -1] / table_patterns.1[1, 2], 2))

num.elements <- nrow(tab1)
df <- rbind(tab1, tab2)
# rearranging the rows (n, proportion)
df <-
  df[rep(c(1, num.elements + 1), num.elements) + rep(seq(0, num.elements -
                                                           1), each = 2),]
# formatting
df <- digitsByRows(data.frame(df), rep(c(0, 2), num.elements))
# removing white spaces
#df[,1] <- gsub(" ", "", df[,1])

# using the same names as the table
dimnames(df)[[1]] <-
  paste(rep(paste0("M", 1:7), each = 2), rep(c("(n)", "(prop)"), num.elements))
dimnames(df)[[2]] <- dimnames(tab1)[[2]]

# more readable version of the table
df <-
  cbind.data.frame(" " = rep(c("(n)", "(prop)"), num.elements), df)

# reordering - putting close the categories that might be grouped
df1 <- df[, c(1, 2, 7, 3, 4, 6, 5, 8)]

kable(df1, row.names = FALSE,
      caption = "Number (n) and proportion (prop, by measurement occasion) of interviews by type of missingness.") %>%
  pack_rows(index = table(fct_inorder(rep(
    paste0("M", 1:7), each = 2
  )))) %>% kable_styling()
Number (n) and proportion (prop, by measurement occasion) of interviews by type of missingness.
Interview Out-of-household Intermittent missing Missing Out-of-sample Death Administrative censoring
M1
5452 0 0 0 0 0 0
(prop) 1.00 0.00 0.00 0.00 0.00 0.00 0.00
M2
4211 6 274 592 207 162 0
(prop) 0.77 0.00 0.05 0.11 0.04 0.03 0.00
M3
3363 14 289 788 335 409 254
(prop) 0.62 0.00 0.05 0.14 0.06 0.08 0.05
M4
1999 11 146 364 380 561 1991
(prop) 0.37 0.00 0.03 0.07 0.07 0.10 0.37
M5
1581 6 73 297 379 726 2390
(prop) 0.29 0.00 0.01 0.05 0.07 0.13 0.44
M6
1394 9 34 352 379 894 2390
(prop) 0.26 0.00 0.01 0.06 0.07 0.16 0.44
M7
632 3 0 209 380 977 3251
(prop) 0.12 0.00 0.00 0.04 0.07 0.18 0.60
5.2.2.2.1 Number of missing interviews excluding deaths

The following tables explore the type of missing interviews, taking into account the number of reported deaths during follow-up, and evaluate the proportion of interviews carried out excluding the subjects that died during follow-up. We evaluate the number of deaths by baseline wave and the proportion of participants that survived through waves.

As expected, very few subjects died at the beginning of the follow-up, most of the deaths involve individuals first included in the first two waves (in Wave 7 only 37% of the individuals included in Wave 1 were reported to be still alive, and 68% among those included in Wave 2, while almost all the individuals included in Wave 4 or later were still alive in Wave 7)

#number of interviews by baseline wave
num_int_baselinewave <- table(sharew1.baseline$Wave)

#table with number of interviews
table_interviews <- with(share1, (table(baseline=first.wave.cat, measurement=Wave)))

#table with the number of deaths 
table_deaths <- cbind(`Wave 1`=0, table(sharew1.baseline$Wave, sharew1.baseline$death_firstwave_cv))
table_deaths_cum <- t(apply(table_deaths, 1, cumsum))

table_alive <- table_interviews - table_deaths_cum


# arranging in a single table
tab1 <- addmargins(table_deaths,2)
tab2 <- cbind(table_deaths/table_interviews, Sum=apply(table_deaths, 1, sum)/table(sharew1.baseline$Wave))
num.elements <- nrow(tab1)          

#setting to missing all the elements below the diagonal
delta <- row(tab1) - col(tab1)
tab1[delta > 0] <- NA
tab2[delta > 0] <- NA
#additional manual setting 
tab1[3,3] <- tab2[3,3] <- tab1[4,4] <- tab2[4,4] <- tab1[5,5] <- tab2[5,5] <- NA

# arranging in a single table (n, proportion), with proper formatting of the rows (for printing 0 digits,2 digits)

df <- rbind(tab1, tab2)
# rearranging the rows (n, proportion)
df <- df[rep(c(1, num.elements+1), num.elements) + rep(seq(0,num.elements-1), each=2), ]
# formatting
df <- digitsByRows(data.frame(df), rep(c(0, 2), num.elements))

# using the same names as the table
dimnames(df)[[1]] <- paste(rep(dimnames(tab1)[[1]], each =2), rep(c("(n)", "(prop)"), num.elements))
dimnames(df)[[2]] <- dimnames(tab1)[[2]]


opts <- options(knitr.kable.NA = "")

df[df=="NA"| df==" NA"] <- NA


kable(df, caption="Number (n) and proportion (prop) of individuals with reported death in each Wave (by baseline Wave) - each death appears only once, Sum gives the total number/proportion by baseline wave.")  %>% kable_styling()
Number (n) and proportion (prop) of individuals with reported death in each Wave (by baseline Wave) - each death appears only once, Sum gives the total number/proportion by baseline wave.
Wave 1 Wave 2 Wave 3 Wave 4 Wave 5 Wave 6 Wave 7 Sum
Wave 1 (n) 0 66 97 89 112 92 84 540
Wave 1 (prop) 0.00 0.06 0.10 0.10 0.13 0.12 0.13 0.34
Wave 2 (n) 0 36 57 58 53 76 280
Wave 2 (prop) 0.00 0.04 0.07 0.07 0.07 0.12 0.22
Wave 4 (n) 0 3 3 5 11
Wave 4 (prop) 0.00 0.01 0.01 0.02 0.03
Wave 5 (n) 0 54 90 144
Wave 5 (prop) 0.00 0.04 0.07 0.08
Wave 6 (n) 0 3 3
Wave 6 (prop) 0.00 0.01 0.01

About 20% of the participants have missing value at the second measurement occasion; in later measurement occasions the number of missing data does not increase as dramatically. It is interesting to note that, taking the reported deaths into account, the number of missing values decreases in later interviews (Wave 1, measurement occasions 6 and 7).

5.2.2.3 Descriptive statistics comparing the baseline characteristics by type of missingness

In this section we evaluate in more detail the association between missingness and measured characteristics of the participants. The characteristics are compared using descriptive statistics, and baseline characteristics are compared among groups of participants (with complete response, lost to follow-up, with intermittent missingness, that die during the study).

Participants were categorized in those with complete information, intermittent missingness (at least one missing interview followed by at least one valid interview), lost to follow up (only missing interviews from a certain point on or defined as out-of-sample), not part of the household, and with reported death during study and compared by their baseline characteristics. See the definitions in Section 2, types of missing values.

sharew1.baseline <- sharew1.baseline %>%  mutate(typeMissing_5cat_cv =factor(sharew1.baseline$typeMissing_5cat_cv, labels = c("Complete", "Death",  "Intermittent missing", "Lost to follow up",    "Out-of-household"   )    ))

s <-
    Hmisc::summaryM(
       # Wave+ 
      gender+ #country+ 
          age_int+ age_int_cat+ weight+ height_imp+ education_imp+ pa_vig_freq+  pa_low_freq+ cusmoke_imp+ maxgrip ~ factor(typeMissing_5cat_cv),
        data = sharew1.baseline,
        overall = FALSE,
        test = FALSE
    )

Hmisc::html(
    s,
    caption = 'Baseline characteristics by type of missingness',
    exclude1 = TRUE,
    npct = 'both',
    digits = 3,
    prmsd = TRUE,
    brmsd = TRUE,
    msdsize = mu$smaller2
)
Baseline characteristics by type of missingness.
N
Complete
N=2681
Death
N=978
Intermittent missing
N=476
Lost to follow up
N=1296
Out-of-household
N=21
gender : Female 5452 0.54 1440/2681 0.51 494/ 978 0.50 240/ 476 0.53 687/1296 0.38 8/ 21
age_int 5452 52.00 58.00 66.00
60.28 ±  8.79
66.00 75.00 81.00
73.29 ± 10.44
52.00 58.00 64.00
59.55 ±  8.18
53.00 58.00 66.00
60.20 ±  8.57
51.00 54.00 59.00
55.95 ±  6.41
age_int_cat : 50-59 5452 0.54 1452/2681 0.12 120/ 978 0.59 282/ 476 0.54 705/1296 0.81 17/ 21
  60-69 0.29 780/2681 0.21 202/ 978 0.27 127/ 476 0.30 390/1296 0.14 3/ 21
  70-80 0.14 384/2681 0.41 399/ 978 0.13 62/ 476 0.13 166/1296 0.05 1/ 21
  80+ 0.02 65/2681 0.26 257/ 978 0.01 5/ 476 0.03 35/1296 0.00 0/ 21
weight 5361 66.0 76.0 86.0
77.2 ± 15.2
62.5 71.0 81.0
72.7 ± 15.0
65.0 76.0 85.0
77.1 ± 15.6
66.0 75.0 86.0
76.9 ± 15.0
68.0 78.0 90.0
78.6 ± 14.5
height_imp 5418 165.00 172.00 178.00
171.82 ±  9.04
163.00 169.00 175.00
169.34 ±  8.80
165.00 172.00 178.00
171.66 ±  8.98
165.00 172.00 178.00
172.01 ±  9.40
165.00 173.00 185.00
174.67 ±  10.26
education_imp : Low 5428 0.17 447/2678 0.38 371/ 969 0.19 90/ 472 0.22 282/1288 0.05 1/ 21
  Medium 0.38 1019/2678 0.39 375/ 969 0.41 195/ 472 0.41 531/1288 0.48 10/ 21
  High 0.45 1212/2678 0.23 223/ 969 0.40 187/ 472 0.37 475/1288 0.48 10/ 21
pa_vig_freq 5423 0.67 1798/2677 0.35 339/ 965 0.66 311/ 473 0.63 810/1287 0.76 16/ 21
pa_low_freq 5422 0.94 2512/2677 0.73 707/ 964 0.95 447/ 473 0.93 1200/1287 0.95 20/ 21
cusmoke_imp : Yes 5423 0.22 590/2679 0.34 327/ 963 0.27 126/ 472 0.27 343/1288 0.43 9/ 21
maxgrip 5272 29.0 36.0 48.0
38.5 ± 12.5
21.5 29.0 38.0
30.3 ± 11.9
28.0 37.5 49.0
38.5 ± 13.1
28.0 36.0 48.0
38.3 ± 12.9
35.0 50.0 54.0
43.9 ± 13.1
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.

Deaths were more commonly observed among men, participants that were older and had lower education and that reported less physical activity and more smoking, and considerably lower levels of grip strength . Respondents and non responders for reasons other than death were similar in their baseline charactheristics, other than for education (higher among complete responders); complete responders smoked less frequently than non-responders. Participants with intermittent missingness were slightly younger than others. The characteristics of the small group of participants out-of-sample (household) indicated that these was a younger group.

Similar results were obtained also when the analysis was conducted within each wave or when the groups were compared using their missing status at second measurement occasion (data not shown).

5.2.2.4 Deaths: additional details

5.2.2.4.1 Quality of reporting of deaths

Overall, the quality of reporting deaths in data from Denmark was very good. As shown below, the vital status was unknown for few participants and the deaths were reported timely.

The table below shows the number of participants stratified by dead/alive status last available information (Wave 7), as reported in the coverscreen data.

df <- table(cv.all$deadoralive_w7)
df.prop   <- round(df / sum(df) * 100, 1)

df <- data.frame(t(c(df, df.prop)[c(1, 4, 2, 5, 3, 6)]))



names(df) <- c("n", "%", "n", "%", "n", "%")

#kable(df)%>% kable_styling()


df %>% kable(digits = 1, caption = "Number (n) and percentage (%) of participants classified by dead/alive status at last available wave, per country") %>%  add_header_above(c(
  "Unknown" = 2,
  "Alive" = 2,
  "Dead" = 2
)) %>% kable_styling()
Number (n) and percentage (%) of participants classified by dead/alive status at last available wave, per country
Unknown
Alive
Dead
n % n % n %
53 1 4421 81.1 978 17.9

In Denmark the percentage of participants with unknow vital status at the end of the study was only 1% (data from Denmark are linked with the population registry).

5.2.2.5 Out-of-sample: additional details

We explored the characteristics of the participants that were categorized as out-of-sample.

Overall, 382 participants were categorized as out-of-sample at some point during the study, 366 of which in Wave 7.

# separating out-of-sample and out-of-household, using missing.occasion.4_cv instead of missing.occasion.3_cv


combinations.outOfSample <-
  missing.occasion.4_cv[apply(missing.occasion.4_cv, 1, function(x)
    any(x == -1000)), ] %>% count() %>% arrange(-freq)

combinations.outOfSample <-  combinations.outOfSample[-1, ]
dimnames(combinations.outOfSample)[[2]] <-
  c(paste0("M", c(1:7)), "freq")

Here we display the the combinations with at least two participants (covering all but 9 out-of-sample participants); note that code -1000 indicates that a participant is out-of-sample, code -10 that the interview is missing.

print.data.frame(combinations.outOfSample[1:10,])
##    M1    M2    M3    M4    M5    M6    M7 freq
## 2   1   -10   -10   -10   -10 -1000    NA  102
## 3   1   -10   -10   -10   -10   -10 -1000   95
## 4   1     1   -10   -10   -10   -10 -1000   68
## 5   1     1   -10   -10   -10 -1000    NA   57
## 6   1     1     1   -10   -10   -10 -1000   33
## 7   1   -10     1   -10   -10   -10 -1000   10
## 8   1 -1000 -1000 -1000 -1000 -1000 -1000    2
## 9   1 -1000 -1000    NA    NA    NA    NA    2
## 10  1   -10 -1001 -1000 -1000 -1000 -1000    2
## 11  1     1 -1000 -1000    NA    NA    NA    2

We can observe that the vast majority of participants with interview Out-of-sample (code -1000) are excluded from the study after 3, 4 or 5 missing interviews (code -10). This indicates that in Denmark some rules that would exclude participants with long non-response were used to exclude participants from the study

The detailed exploration of metadata confirmed this finding (from Wave 7, the participants that did not participate for 3 consecutive interviews, or for which the end-of-life interview was not completed in two waves, were categorized as out of sample).

This indicates that it is appropriate to interpret participants classified as out-of-sample as participants lost to follow-up.

5.2.2.6 Out-of-household: additional details

Overall, only 34 participants were categorized as out-of-household at some point during the study, the numbers were rather uniform across waves.

# separating out of sample and out-of-household, using missing.occasion.4_cv instead of missing.occasion.3_cv

combinations.outOfSample <-
  missing.occasion.4_cv[apply(missing.occasion.4_cv, 1, function(x)
    any(x == -1001)), ] %>% count() %>% arrange(-freq)

combinations.outOfSample <-  combinations.outOfSample[-1, ]
dimnames(combinations.outOfSample)[[2]] <-
  c(paste0("M", c(1:7)), "freq")
print.data.frame(combinations.outOfSample)
##    M1    M2    M3    M4    M5    M6    M7 freq
## 2   1   -10 -1001    NA    NA    NA    NA    3
## 3   1   -10 -1001 -1000 -1000 -1000 -1000    2
## 4   1     1 -1001    NA    NA    NA    NA    2
## 5   1     1     1     1     1 -1001    NA    2
## 6   1 -1001 -1001 -1001 -1001 -1000    NA    1
## 7   1 -1001 -1000 -1000 -1000  -100    NA    1
## 8   1 -1001  -100  -100  -100  -100  -100    1
## 9   1 -1001   -10 -1000  -100  -100  -100    1
## 10  1 -1001     1     1     1     1     1    1
## 11  1 -1001     1    NA    NA    NA    NA    1
## 12  1   -10 -1001 -1001 -1001 -1001    NA    1
## 13  1   -10 -1001 -1001   -10  -100    NA    1
## 14  1   -10   -10 -1001     1     1  -100    1
## 15  1   -10   -10 -1001     1     1     1    1
## 16  1   -10   -10   -10 -1001 -1001    NA    1
## 17  1   -10   -10     1     1     1 -1001    1
## 18  1     1 -1001 -1001 -1001     1     1    1
## 19  1     1 -1001 -1001     1     1     1    1
## 20  1     1 -1001 -1001    NA    NA    NA    1
## 21  1     1 -1001     1     1     1     1    1
## 22  1     1   -10 -1001 -1001 -1001    NA    1
## 23  1     1   -10 -1001   -10   -10   -10    1
## 24  1     1   -10   -10   -10 -1001     1    1
## 25  1     1   -10     1     1 -1001    NA    1
## 26  1     1     1 -1001   -10   -10  -100    1
## 27  1     1     1     1 -1001     1    NA    1
## 28  1     1     1     1     1 -1001 -1001    1
## 29  1     1     1     1     1 -1001  -100    1
## 30  1     1     1     1     1     1 -1001    1

We can observe that most of the combinations appear only once. In some cases participants are categorized as out of sample after having been out of the household. In few cases the participants that were out of the household re-enter the study (are interviewed again or appear as having missing interviews). Given the very small number of participants in this group, their further study does not seem of interest. In the statistical analyses these observations should be treated as missing by design.

5.2.3 Variable missingness (item missingness, M4)

Here we describe the missing values for the variables included in the analysis strategy (AS) as appearing in the model addressing the primary research question. The analysis is restricted to the statistical units for which the interviews were available (unit missingness is not addressed in this section). We explore also the amount of missing outcomes among the interviews that were conducted.

5.2.3.1 Item missingness at baseline interview, overall and by sex

Number and percentage of missing values at baseline interview.

tab1 <- sharew1.baseline %>%
  select(
    age_int,
    weight,
    height_imp,
    gender,
    education_imp,
    pa_vig_freq ,
    pa_low_freq,
    cusmoke_imp,
    maxgrip
  ) %>%
  miss_var_summary(order = FALSE)


tab2 <- sharew1.baseline %>%
  select(
    age_int,
    weight,
    height_imp,
    gender,
    education_imp,
    pa_vig_freq ,
    pa_low_freq,
    cusmoke_imp,
    maxgrip
  ) %>% group_by(gender) %>%
  miss_var_summary(order = FALSE)

tab <-
  left_join(tab1, filter(tab2, gender == "Female"), by = "variable") %>%
  left_join(filter(tab2, gender ==
                     "Male"), by = "variable") %>% arrange(-n_miss.x) %>% rename(
                       "Missing (count)" = n_miss.x ,
                       "Missing (%)" = pct_miss.x,
                       "F: Missing (count)" = n_miss.y ,
                       "F: Missing (%)" = pct_miss.y ,
                       "M: Missing (count)" = n_miss,
                       "M: Missing (%)" = pct_miss
                     ) %>% select(!starts_with("gender"))


kable(
  tab,
  col.names = c(
    "Variable",
    "Missing (count)",
    "Missing (%)",
    "Missing (count)",
    "Missing (%)",
    "Missing (count)",
    "Missing (%)"
  ),
  digits = c(0, 0, 2, 0, 2, 0, 2)
) %>% add_header_above(c(
  " " = 1,
  "All" = 2,
  "Females" = 2,
  "Males" = 2
)) %>%   kable_styling()
All
Females
Males
Variable Missing (count) Missing (%) Missing (count) Missing (%) Missing (count) Missing (%)
maxgrip 180 3.30 109 3.80 71 2.75
weight 91 1.67 73 2.54 18 0.70
height_imp 34 0.62 21 0.73 13 0.50
pa_low_freq 30 0.55 13 0.45 17 0.66
pa_vig_freq 29 0.53 12 0.42 17 0.66
cusmoke_imp 29 0.53 13 0.45 16 0.62
education_imp 24 0.44 10 0.35 14 0.54
age_int 0 0.00 0 0.00 0 0.00
gender 0 0.00
##### Item missingness at baseline interview, by sex
sharew1.baseline %>%
  select(
    weight,
    height_imp,
    education_imp ,
    pa_vig_freq ,
    pa_low_freq,
    cusmoke_imp,
    maxgrip,
    gender
  ) %>%
  group_by(gender) %>% gg_miss_var(show_pct = TRUE) + facet_wrap(gender ~
                                                                   ., ncol = 3)

Overall, the number of missing items at baseline is very small, the maxgrip outcome variable was the variable with most missing values (2.5%). Age and sex were not missing for any of the participants at the baseline interview. Also in longitudinal interviews age and sex were not missing for any of the participants. For this reason these variables were omitted from further summaries of missing values.

Also when stratified by sex the percentages of item missing values were low, weight was missing more frequently for women.

5.2.3.2 Item missingness at baseline interview, by age group

tab1 <- sharew1.baseline %>%
  select(weight,
         height_imp,
         education_imp,
         pa_vig_freq ,
         pa_low_freq,
         cusmoke_imp,
         maxgrip) %>%
  miss_var_summary(order = FALSE)


tab2 <- sharew1.baseline %>%
  select(
    age_int_cat,
    weight,
    height_imp,
    education_imp,
    pa_vig_freq ,
    pa_low_freq,
    cusmoke_imp,
    maxgrip
  ) %>% group_by(age_int_cat) %>%
  miss_var_summary(order = FALSE)

tab <-
  left_join(tab1, filter(tab2, age_int_cat == "50-59"), by = "variable") %>% left_join(filter(tab2, age_int_cat ==
                                                                                                "60-69"), by = "variable") %>% left_join(filter(tab2, age_int_cat == "70-80"), by =
                                                                                                                                           "variable") %>% left_join(filter(tab2, age_int_cat == "80+"), by = "variable") %>% arrange(-n_miss.x) %>% rename(
                                                                                                                                             "Missing (count)" = n_miss.x,
                                                                                                                                             "Missing (%)" = pct_miss.x,
                                                                                                                                             "50-59: Missing (count)" = n_miss.y,
                                                                                                                                             "50-59: Missing (%)" = pct_miss.y ,
                                                                                                                                             "60-69: Missing (count)" = n_miss.x.x,
                                                                                                                                             "60-69: Missing (%)" = pct_miss.x.x,
                                                                                                                                             "70-80: Missing (count)" = n_miss.y.y,
                                                                                                                                             "70-80: Missing (%)" = pct_miss.y.y,
                                                                                                                                             "80+: Missing (count)" = n_miss,
                                                                                                                                             "80+: Missing (%)" = pct_miss
                                                                                                                                           ) %>% select(!starts_with("age_int_cat"))



kable(
  tab,
  col.names = c(
    "Variable",
    "Missing (count)",
    "Missing (%)",
    "Missing (count)",
    "Missing (%)",
    "Missing (count)",
    "Missing (%)",
    "Missing (count)",
    "Missing (%)",
    "Missing (count)",
    "Missing (%)"
  ),
  digits = c(0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2)
) %>% add_header_above(c(
  " " = 1,
  "All" = 2,
  "50-59" = 2,
  "60-69" = 2,
  "70-80" = 2,
  "80+" = 2
)) %>%   kable_styling() %>%
  scroll_box(width =  "100%", height = "450px")
All
50-59
60-69
70-80
80+
Variable Missing (count) Missing (%) Missing (count) Missing (%) Missing (count) Missing (%) Missing (count) Missing (%) Missing (count) Missing (%)
maxgrip 180 3.30 57 2.21 35 2.33 43 4.25 45 12.43
weight 91 1.67 30 1.16 21 1.40 22 2.17 18 4.97
height_imp 34 0.62 6 0.23 4 0.27 11 1.09 13 3.59
pa_low_freq 30 0.55 10 0.39 5 0.33 7 0.69 8 2.21
pa_vig_freq 29 0.53 11 0.43 5 0.33 5 0.49 8 2.21
cusmoke_imp 29 0.53 10 0.39 5 0.33 6 0.59 8 2.21
education_imp 24 0.44 13 0.50 4 0.27 3 0.30 4 1.10

When stratified by age groups, the percentages of item missing values somehow increased with age, most notably for the outcome variable.

5.2.3.3 Item missingness at baseline interview, by Wave

The summary of missingness by wave are useful for visualizing possible heterogeneity across waves.

sharew1.baseline %>%
     select(weight, height_imp,  education_imp , pa_vig_freq , pa_low_freq, cusmoke_imp,  maxgrip, Wave) %>%
     group_by(Wave) %>% gg_miss_var(show_pct = TRUE) + facet_wrap(Wave~., ncol=3)

Baseline interviews taken in different waves do not differ substantially in terms of missing values. Note that Wave 4 and 6 had a small number of baseline interviews, therefore deviations of percentages of missing values from the other waves should not be overinterpreted.

5.2.3.4 Item missingness at baseline and longitudinal interviews, by Wave

The variable that had the most problematic behaviour as of missing values in the longitudinal interviews was current smoking. The current smoking information was not recorded in longitudinal interviews in wave 6 and 7, nor in SHARELIFE wave 3 interviews, while it was recorded in baseline iterviews in all waves. The generated variable cusmoke provided by SHARE, which should report the current smoking, was not defined in wave 6 and 7, even when the data were available (with baseline interviews).

table(share$cusmoke_imp, share$Wave, exclude=NULL)
##       
##        Wave 1 Wave 2 Wave 3 Wave 4 Wave 5 Wave 6 Wave 7
##   No     1091   1782      0   1590   3138    210     41
##   Yes     499    649      0    488    775     56    165
##   <NA>      6     56   1979     34      6   3248   2819

The following summaries will use the variable smoking at baseline (time-fixed).

In the following summaries we consider only variables that are time-varying in the analysis strategy (education and height and smoking at baseline are excluded).

Number (n) and percentage (%) of missing values per variable, by Wave
Wave 1
Wave 2
Wave 3
Wave 4
Wave 5
Wave 6
Wave 7
Variable n % n % n % n % n % n % n %
weight 26 1.63 46 1.85 1979 100.00 36 1.70 60 1.53 59 1.68 70 2.31
pa_vig_freq 5 0.31 57 2.29 1979 100.00 35 1.66 6 0.15 5 0.14 1836 60.69
pa_low_freq 4 0.25 57 2.29 1979 100.00 35 1.66 8 0.20 5 0.14 1835 60.66
maxgrip 66 4.14 91 3.66 70 3.54 78 3.69 142 3.62 107 3.04 149 4.93
share1 %>% select(weight, pa_vig_freq , pa_low_freq, maxgrip, Wave) %>% group_by(Wave) %>%   gg_miss_var(show_pct = TRUE) + facet_wrap(Wave~., ncol=3)

Between waves there are big differences in terms of missing values, especially for SHARELIFE interviews (Wave 3 and partly Wave 7, where a part of the inverviews are SHARELIFE interviews), where some variables are missing by design. The outcome variable had a proportion of missing values that was roughly comparable among waves, missing more often in the last Wave.

In our study the missingness by design in the SHARELIFE interviews of the two variables about physical activity is the most problematic aspect; weight is missing by design in wave 3 but not in wave 7 SHARELIFE interviews. This characteristic indicates that complete case analysis would not be feasible if weight and physical activity are used as explanatory variables in the models.

To further explore the effect of waves and/or baseline vs longitudinal interviews, we repeated the analyses stratifying the results by type of interview (baseline, longitudinal or SHARELIFE interviews). This graph makes the missingness by design easier to understand.

share1 %>% select(weight, pa_vig_freq , pa_low_freq, maxgrip, Wave, measurement_occasion) %>% 
  mutate (first=ifelse(measurement_occasion==1, "Baseline", "Longitudinal")) %>% select (-measurement_occasion) %>%
  group_by(Wave, first) %>%   gg_miss_var(show_pct = TRUE) + facet_grid(Wave~first)


ggsave("figures/itemNA_bywaveType.png", device="png", scale=1, width =7, height=7, units="in", pointsize=12, bg="transparent")

Some variables are missing by design in longitudinal or SHARELIFE interviews, for example current smoking in Wave 6 and 7, or height in Wave 4 and 7. If both variables are used as time fixed variables, measured at baseline, this does not constitute a problem in our study. Other variables missing by design: physical activity variables in SHARELIFE interviews, weight in SHARELIFE interviews of Wave 3.

p <- share1 %>% filter(!is.na(typeQuest)) %>% select(weight, height, pa_vig_freq , pa_low_freq, maxgrip, cusmoke_imp, Wave, measurement_occasion, typeQuest) %>%  rename(`PA vigorous`=pa_vig_freq,
         `PA moderate`=pa_low_freq,
         `Current smoking`=cusmoke_imp,
         `Weight`=weight,
         `Height` = height,
         `Grip strength` = maxgrip
         ) %>%   select (-measurement_occasion) %>%  group_by(Wave, typeQuest) %>%   gg_miss_var(show_pct = TRUE) + facet_grid(Wave~typeQuest) +  geom_rect(xmin = -Inf,xmax = Inf,  ymin = -Inf,ymax = Inf,alpha = 0, color="black")



# add sample size per panel

df <- share1 %>% filter(!is.na(typeQuest))
  
dat_text <- data.frame(
  label = paste("n=", c(table(df$Wave, df$typeQuest))),
  Wave   = rep(c("Wave 1", "Wave 2", "Wave 3", "Wave 4", "Wave 5",
              "Wave 6", "Wave 7"), 3),
  typeQuest = rep(c("Baseline questionnaire", "Longitudinal questionnaire","SHARELIFE"), each =7)
)

# label top aligned
#p + geom_text(
#  data    = dat_text,
#  mapping = aes(x = Inf, y = Inf, label = label),
#  hjust   = 1.2,
#  vjust   = 1.2, 
#  size = 3
#)

  p + geom_text(
          data    = dat_text,
          mapping = aes(x = -Inf, y = Inf, label = label),
          hjust   = 1.1,
          vjust   = -0.5, 
          size = 3
      )

ggsave("figures/itemNA_bywaveType2_v3.png", device="png", scale=1, width =7, height=7, units="in", pointsize=12, bg="transparent")

ggsave("figuresSubmission/fig9.tif", scale=1, width =7, height=7, units="in", pointsize=12, bg="transparent", compression="lzw", device="tiff")

5.2.3.5 Item missingness by measurement occasion (removing the missing by design missingness)

Here we evaluate the percentages of missing values, taking into account in which interviews the values are missing by design (and excluding them).

The proportion of participants with missing values of some time-varying variables is very small if the measurements where the variables are missing by design are not considered. Only for the outcome variable we observed that the proportion of participants with missing values in the outcome (and valid interview) increased at later measurement occasions.

#number of observations where there is no missing by design
# number of missing / number evaluable
num.noMBD <-
  share1 %>% mutate(
    pa_vig_freq = ifelse(typeQuest == "SHARELIFE",-9999, pa_vig_freq),
    pa_low_freq = ifelse(typeQuest == "SHARELIFE",-9999, pa_low_freq),
    weight = ifelse(Wave == "Wave 3",-9999, weight)
  ) %>%
  select(weight,
         pa_vig_freq ,
         pa_low_freq,
         maxgrip,
         measurement_occasion) %>% group_by(measurement_occasion) %>%
  summarise_all(.funs = list(
    num.NA = ~ sum(x = is.na(.)),
    num.noMBD = ~ length(x = .[. != -9999])
  ))


tab1 <- select(num.noMBD,!ends_with("NA"))
tab2 <- select(num.noMBD, starts_with("measurement") | ends_with("NA"))
tab3 <- tab2 / tab1 * 100

dimnames(tab1)[[2]] <-
  dimnames(tab2)[[2]] <- dimnames(tab3)[[2]] <- c("MO",
                                                  "weight", "pa_vig_freq", "pa_low_freq",  "maxgrip")

tab1 <- t(tab1)
tab2 <- t(tab2)
tab3 <- t(tab3)


tab <- rbind(tab3[1, ], tab2[1, ], tab1[1, ])

for (i in 2:nrow(tab1))
  
  tab <- rbind.data.frame(tab, tab3[i, ], tab2[i, ], tab1[i, ])


tab <- tab[-c(1:3), ]

dimnames(tab)[[1]] <-
  paste0(rep(c(
    "weight", "pa_vig_freq", "pa_low_freq", "maxgrip"
  ), each = 3), rep(c(": % NA", ": NA", ": n"), 4))


# uses function defined in the index file that specifies the format of the output by setting the format in the data frame


tab <- digitsByRows(data.frame(tab), rep(c(2, 0, 0), 4))

dimnames(tab)[[1]] <-
  paste0(rep(c(
    "weight", "pa_vig_freq", "pa_low_freq", "maxgrip"
  ), each = 3), rep(c(": % NA", ": NA", ": n"), 4))



tab <-
  cbind.data.frame(" " = rep(c("% NA", "NA", "n"), nrow(tab) / 3), tab)

kable(
  tab ,
  row.names = FALSE,
  ,
  caption = "Number (n) and percentage (%) of missing values per variable, by measurement occasion",
  col.names = c(" ", "M1", "M2", "M3", "M4", "M5", "M6", "M7")
) %>%
  pack_rows(index = table(fct_inorder(rep(
    c("weight", "pa_vig_freq", "pa_low_freq", "maxgrip") , each = 3
  )))) %>% kable_styling()
Number (n) and percentage (%) of missing values per variable, by measurement occasion
M1 M2 M3 M4 M5 M6 M7
weight
% NA 1.67 1.49 2.19 2.15 1.77 1.79 1.58
NA 91 48 52 43 28 25 10
n 5452 3216 2379 1999 1581 1394 632
pa_vig_freq
% NA 0.53 1.40 1.33 1.16 0.13 0.22 0.00
NA 29 42 15 20 2 3 0
n 5452 3008 1131 1718 1581 1362 566
pa_low_freq
% NA 0.55 1.40 1.33 1.16 0.13 0.15 0.00
NA 30 42 15 20 2 2 0
n 5452 3008 1131 1718 1581 1362 566
maxgrip
% NA 3.30 2.90 3.93 4.20 4.24 5.31 6.96
NA 180 122 132 84 67 74 44
n 5452 4211 3363 1999 1581 1394 632

By sex

#number of observations where there is no missing by design
# number of missing / number evaluable
num.noMBD <-
  share1 %>% mutate(
    pa_vig_freq = ifelse(typeQuest == "SHARELIFE",-9999, pa_vig_freq),
    pa_low_freq = ifelse(typeQuest == "SHARELIFE",-9999, pa_low_freq),
    weight = ifelse(Wave == "Wave 3",-9999, weight)
  ) %>% filter(gender == "Female") %>%
  select(weight,
         pa_vig_freq ,
         pa_low_freq,
         maxgrip,
         measurement_occasion) %>% group_by(measurement_occasion) %>%
  summarise_all(.funs = list(
    num.NA = ~ sum(x = is.na(.)),
    num.noMBD = ~ length(x = .[. != -9999])
  ))


tab1 <- select(num.noMBD,!ends_with("NA"))
tab2 <-
  select(num.noMBD, starts_with("measurement") | ends_with("NA"))
tab3 <- tab2 / tab1 * 100

dimnames(tab1)[[2]] <-
  dimnames(tab2)[[2]] <- 
    dimnames(tab3)[[2]] <-  c("MO", "weight", "pa_vig_freq", "pa_low_freq",  "maxgrip")

tab1 <- t(tab1)
tab2 <- t(tab2)
tab3 <- t(tab3)


tab <- rbind(tab3[1, ], tab2[1, ], tab1[1, ])

for (i in 2:nrow(tab1))
  
  tab <- rbind.data.frame(tab, tab3[i, ], tab2[i, ], tab1[i, ])


tab <- tab[-c(1:3), ]

dimnames(tab)[[1]] <-
  paste0(rep(c(
    "weight", "pa_vig_freq", "pa_low_freq", "maxgrip"
  ), each = 3), rep(c(": % NA", ": NA", ": n"), 4))

# uses function defined in the index file that specifies the format of the output by setting the format in the data frame

tab <- digitsByRows(data.frame(tab), rep(c(2, 0, 0), 4))

dimnames(tab)[[1]] <-
  paste0(rep(c(
    "weight", "pa_vig_freq", "pa_low_freq", "maxgrip"
  ), each = 3), rep(c(": % NA", ": NA", ": n"), 4))

tabF <-
  cbind.data.frame(" " = rep(c("% NA", "NA", "n"), nrow(tab) / 3), tab)
#number of observations where there is no missing by design
# number of missing / number evaluable
num.noMBD <-
  share1 %>% mutate(
    pa_vig_freq = ifelse(typeQuest == "SHARELIFE",-9999, pa_vig_freq),
    pa_low_freq = ifelse(typeQuest == "SHARELIFE",-9999, pa_low_freq),
    weight = ifelse(Wave == "Wave 3",-9999, weight)
  ) %>% filter(gender == "Male") %>%
  select(weight,
         pa_vig_freq ,
         pa_low_freq,
         maxgrip,
         measurement_occasion) %>% group_by(measurement_occasion) %>%
  summarise_all(.funs = list(
    num.NA = ~ sum(x = is.na(.)),
    num.noMBD = ~ length(x = .[. != -9999])
  ))

tab1 <- select(num.noMBD,!ends_with("NA"))
tab2 <-
  select(num.noMBD, starts_with("measurement") | ends_with("NA"))
tab3 <- tab2 / tab1 * 100

dimnames(tab1)[[2]] <-
  dimnames(tab2)[[2]] <-
  dimnames(tab3)[[2]] <-
  c("MO", "weight", "pa_vig_freq", "pa_low_freq", "maxgrip")

tab1 <- t(tab1)
tab2 <- t(tab2)
tab3 <- t(tab3)


tab <- rbind(tab3[1, ], tab2[1, ], tab1[1, ])

for (i in 2:nrow(tab1))
  
  tab <- rbind.data.frame(tab, tab3[i, ], tab2[i, ], tab1[i, ])


tab <- tab[-c(1:3), ]

dimnames(tab)[[1]] <-
  paste0(rep(c(
    "weight", "pa_vig_freq", "pa_low_freq", "maxgrip"
  ), each = 3), rep(c(": % NA", ": NA", ": n"), 4))


# uses function defined in the index file that specifies the format of the output by setting the format in the data frame
tab <- digitsByRows(data.frame(tab), rep(c(2, 0, 0), 4))

dimnames(tab)[[1]] <-
  paste0(rep(c(
    "weight", "pa_vig_freq", "pa_low_freq", "maxgrip"
  ), each = 3), rep(c(": % NA", ": NA", ": n"), 4))

tabM <-
  cbind.data.frame(" " = rep(c("% NA", "NA", "n"), nrow(tab) / 3), tab)
tab <- rbind.data.frame(
  tabM[1, ],
  paste(tabM[2, ], tabM[3, ], sep = "/"),
  tabF[1, ],
  paste(tabF[2, ], tabF[3, ], sep = "/"),
  tabM[4, ],
  paste(tabM[5, ], tabM[6, ], sep = "/"),
  tabF[4, ],
  paste(tabF[5, ], tabF[6, ], sep = "/"),
  tabM[7, ],
  paste(tabM[8, ], tabM[9, ], sep = "/"),
  tabF[7, ],
  paste(tabF[8, ], tabF[9, ], sep = "/"),
  tabM[10, ],
  paste(tabM[11, ], tabM[12, ], sep = "/"),
  tabF[10, ],
  paste(tabF[11, ], tabF[12, ], sep = "/")
  
  
  
)

kable(
  tab ,
  row.names = FALSE,
  ,
  caption = "Number (n) and percentage (%) of missing values per variable, by measurement occasion by sex",
  col.names = c(" ",  "M1", "M2", "M3", "M4", "M5", "M6", "M7")
) %>%
  pack_rows(index = table(fct_inorder(paste(
    rep(c(
      "weight", "pa_vig_freq", "pa_low_freq", "maxgrip"
    ) , each = 4), rep(c("Males", "Females"), each = 2)
  )))) %>% kable_styling() %>% scroll_box(width = "100%", height = "100%")
Number (n) and percentage (%) of missing values per variable, by measurement occasion by sex
M1 M2 M3 M4 M5 M6 M7
weight Males
% NA 0.70 0.65 0.63 0.96 0.83 1.24 1.02
NA/n 18/2583 10/1528 7/1107 9/940 6/720 8/646 3/294
weight Females
% NA 2.54 2.25 3.54 3.21 2.56 2.27 2.07
NA/n 73/2869 38/1688 45/1272 34/1059 22/861 17/748 7/338
pa_vig_freq Males
% NA 0.66 1.20 0.57 1.01 0.28 0.48 0.00
NA/n 17/2583 17/1419 3/526 8/793 2/720 3/630 0/257
pa_vig_freq Females
% NA 0.42 1.57 1.98 1.30 0.00 0.00 0.00
NA/n 12/2869 25/1589 12/605 12/925 0/861 0/732 0/309
pa_low_freq Males
% NA 0.66 1.20 0.57 1.01 0.28 0.32 0.00
NA/n 17/2583 17/1419 3/526 8/793 2/720 2/630 0/257
pa_low_freq Females
% NA 0.45 1.57 1.98 1.30 0.00 0.00 0.00
NA/n 13/2869 25/1589 12/605 12/925 0/861 0/732 0/309
maxgrip Males
% NA 2.75 2.02 2.50 2.87 3.19 4.49 5.10
NA/n 71/2583 40/1983 39/1562 27/940 23/720 29/646 15/294
maxgrip Females
% NA 3.80 3.68 5.16 5.38 5.11 6.02 8.58
NA/n 109/2869 82/2228 93/1801 57/1059 44/861 45/748 29/338
# for the paper, not printed

#  kable(tab , row.names=FALSE,     , caption="Number (n) and percentage (%) of missing values per variable, by measurement occasion by sex", col.names = c(" ",  "M1", "M2", "M3", "M4", "M5", "M6", "M7"), format="latex") %>%   pack_rows(index = table(fct_inorder(paste(rep(c("weight", "pa_vig_freq", "pa_low_freq", "maxgrip") , each=4), rep(c("Males", "Females"), each =2)))))

5.2.3.6 Item missingness of outcome: additional details

We restricted the attention to item missingness of maxgrip across measurement occasions (maxgrip missing, interview performed).

Outcome missingenss was between 3.3 and 7% across measurement occasions. `

mg.occasion.NA <- mg.occasion
mg.occasion.NA[(missing.occasion.5_cv != 1 |
                  is.na(missing.occasion.5_cv))] <- -999
mg.occasion.NA[is.na(missing.occasion.5_cv)] <- -999
mg.occasion.NA <- as.data.frame(mg.occasion.NA)
dimnames(mg.occasion.NA)[[2]] <- paste0("M", 1:7)

Note that the number of interviews across measurement occasions is not comparable, as less observations are available for later measurement occasions.

my.tab <-
  round(apply(mg.occasion.NA, 2, function(x)
    c(
      length(x[x != -999]), sum(is.na(x[x != -999])),  100 * mean(is.na(x[x !=
                                                                            -999]))
    )), 2)

dimnames(my.tab)[[1]] <-
  c("Number or participants", "Missing", "% Missing")

kable(my.tab, caption = "Number and percentage of missing values by measurement occasion", digits = 1) %>% kable_styling()
Number and percentage of missing values by measurement occasion
M1 M2 M3 M4 M5 M6 M7
Number or participants 5452.0 4211.0 3363.0 1999.0 1581.0 1394.0 632
Missing 180.0 122.0 132.0 84.0 67.0 74.0 44
% Missing 3.3 2.9 3.9 4.2 4.2 5.3 7

The table below gives the distribution of number of missing values in outcome by number of measurements (interviews available)

num.measured <-
  apply(mg.occasion.NA, 1, function(x)
    length(x[x != -999]))
num.missing <-
  apply(mg.occasion.NA, 1, function(x)
    sum(is.na(x[x != -999])))
num.available <- num.measured - num.missing


#add all missing outcome variable
#corrected bug March 2022
sharew1.baseline$outcome_allNA <-
  ifelse(num.measured == num.missing, 1, 0)

my.tab <- addmargins(table(num.measured,  num.missing), 2)

dimnames(my.tab)[[1]] <- paste("M =", c(1:7))
#dimnames(my.tab)[[2]] <- c(0:7, "n") # in Denmark there are no participants with 7 missing values, adapted to the size of the table
dimnames(my.tab)[[2]][ncol(my.tab)] <-
  "n" # in Denmark there are no participants with 7 missing values, adapted to the size of

kable(my.tab, caption = "Number and percentage of missing values by measurement occasion", digits = 2) %>% add_header_above(c(
  "Interviews" = 1,
  "Missing" = ncol(my.tab) - 1,
  "Total"
)) %>%
  kable_styling()
Number and percentage of missing values by measurement occasion
Interviews
Missing
Total
0 1 2 3 4 5 n
M = 1 877 88 0 0 0 0 965
M = 2 883 64 19 0 0 0 966
M = 3 1387 90 23 8 0 0 1508
M = 4 460 45 16 5 1 0 527
M = 5 248 38 15 3 2 1 307
M = 6 615 51 12 3 1 3 685
M = 7 448 35 5 5 1 0 494
5.2.3.6.1 Outcome missingness stratified by age and sex

Here we explore the association between age (time metric from analysis strategy) and outcome missingness. The probability of missing outcome considerably increased with age, especially for women. This is shown by using descriptive statistics of the proportion of missing outcomes by sex and age group, in the complete data set (using all observations).

tabM <-
  share1 %>% filter(gender == "Male") %$% table(!is.na(.$maxgrip), .$age_int_cat)

tabF <-
  share1 %>% filter(gender == "Female") %$% table(!is.na(.$maxgrip), .$age_int_cat)

tab <- rbind.data.frame(
  round(prop.table(tabM, 2)[1, ] * 100, 1),
  paste(tabM[1, ], tabM[2, ], sep = "/"),
  round(prop.table(tabF, 2)[1, ] * 100, 1),
  paste(tabF[1, ], tabF[2, ], sep = "/")
)


kable(tab,
      caption = "Number (n) and percentage (%) of missing values per variable, by age group and sex",
      col.names = c("50-59", "60-69", "70-79", "80+")) %>%
  pack_rows(index = table(fct_inorder(rep(
    c("Males", "Females") , each = 2
  )))) %>% kable_styling()
Number (n) and percentage (%) of missing values per variable, by age group and sex
50-59 60-69 70-79 80+
Males
1.5 1.9 3.1 11.4
45/2890 57/2989 63/1994 79/611
Females
2.4 2.7 6.2 13.8
77/3159 89/3226 140/2104 153/956
# printing for the paper in latex format
#kable(tab  , caption="Number (n) and percentage (%) of missing values per variable, by age group and sex", col.names = c( "50-59", "60-69", "70-79", "80+"), format = "latex") %>%   pack_rows(index = table(fct_inorder(rep(c("Males", "Females") , each=2))))%>% kable_styling()

Similar results are obtained using data from baseline interview only.

tabM <-
  sharew1.baseline %>% filter(gender == "Male") %$% table(!is.na(.$maxgrip), .$age_int_cat)
tabF <-
  sharew1.baseline %>% filter(gender == "Female") %$% table(!is.na(.$maxgrip), .$age_int_cat)

tab <- rbind.data.frame(
  round(prop.table(tabM, 2)[1, ] * 100, 1),
  paste(tabM[1, ], tabM[2, ], sep = "/"),
  round(prop.table(tabF, 2)[1, ] * 100, 1),
  paste(tabF[1, ], tabF[2, ], sep = "/")
)


kable(tab,
      caption = "Number (n) and percentage (%) of missing values per variable, by age group and sex",
      col.names = c("50-59", "60-69", "70-79", "80+")) %>%
  pack_rows(index = table(fct_inorder(rep(
    c("Males", "Females") , each = 2
  )))) %>% kable_styling()
Number (n) and percentage (%) of missing values per variable, by age group and sex
50-59 60-69 70-79 80+
Males
1.9 2.3 3.2 10.9
23/1207 17/717 15/457 16/131
Females
2.5 2.3 5.2 13.5
34/1312 18/750 28/512 29/186
# printing for the paper in latex format
#kable(tab  , caption="Number (n) and percentage (%) of missing values per variable, by age group and sex", col.names = c( "50-59", "60-69", "70-79", "80+"), format = "latex") %>%   pack_rows(index = table(fct_inorder(rep(c("Males", "Females") , each=2))))%>% kable_styling()

Also a graphical display is provided that use smoothers (method gam in the geom_smooth function) to estimate the probability of missing values by age (on baseline interview and for each separate wave). As the smoothers can produce unstable estimates, these graphs should not be over-interpreted.

p1 <- ggplot(filter(share.data.first.interview), aes(age_int, ifelse(is.na(maxgrip), 1, 0))) + geom_smooth(aes(fill=gender, color=gender)) + scale_fill_manual(values = c("blue", "pink")) + scale_color_manual(values = c("blue", "pink")) +  labs(x="Age at baseline", y="Probability of missing grip strength (geom_smooth)", color = "Sex", fill = "Sex") + theme_bw()


p2 <- ggplot(filter(share1), aes(age_int, ifelse(is.na(maxgrip), 1, 0))) + geom_smooth(aes(fill=gender, color=gender)) + scale_fill_manual(values = c("blue", "pink")) + scale_color_manual(values = c("blue", "pink")) +  labs(x="Age at interview", y="Probability of missing grip strength (geom_smooth)", color = "Sex", fill = "Sex")  + theme_bw() + facet_wrap(Wave~., nrow = 2)

p1+p2

ggsave("figures/AgeMissingMG.png", height = 4, width = 8, scale = 1.2 )
5.2.3.6.2 Description of the participants with outcome missing at all (avaialble) interviews

Overall, 117 participants had all missing values in the outcome at all measurement occasions (at valid interviews), most of them (n = 88, 75 %) were measured only once. There participants would typically not be considered in modelling (for example, if a random effect model is used).

Participants with all missing outcomes were older, were less physically active, were more commonly females, had lower education than those with some non missing outcome.

s <-
  Hmisc::summaryM(
    gender + age_int+ age_int_cat+ weight+ height_imp+ education_imp+  pa_vig_freq+  pa_low_freq+ cusmoke_imp ~ outcome_allNA,
    data = sharew1.baseline,
    overall = FALSE,
    test = FALSE
  )

Hmisc::html(
  s,
  caption = 'Baseline characteristics by all missing outcome (0: all NA, 1: not all NA outcomes)',
  exclude1 = TRUE,
  npct = 'both',
  digits = 3,
  prmsd = TRUE,
  brmsd = TRUE,
  msdsize = mu$smaller2, 
  scroll = TRUE
)
Baseline characteristics by all missing outcome (0: all NA, 1: not all NA outcomes).
N
0
N=5335
1
N=117
gender : Female 5452 0.52 2795/5335 0.63 74/ 117
age_int 5452 53.0 60.0 69.0
62.3 ± 10.1
59.0 74.0 84.0
72.6 ± 14.1
age_int_cat : 50-59 5452 0.48 2546/5335 0.26 30/ 117
  60-69 0.28 1482/5335 0.17 20/ 117
  70-80 0.18 981/5335 0.26 31/ 117
  80+ 0.06 326/5335 0.31 36/ 117
weight 5361 65.0 75.0 85.0
76.5 ± 15.2
60.0 70.0 82.0
71.7 ± 15.6
height_imp 5418 165.00 171.00 178.00
171.47 ±  9.12
164.00 168.00 172.25
168.84 ±  9.60
education_imp : Low 5428 0.21 1141/5320 0.46 50/ 108
  Medium 0.39 2092/5320 0.35 38/ 108
  High 0.39 2087/5320 0.19 20/ 108
pa_vig_freq 5423 0.61 3253/5320 0.20 21/ 103
pa_low_freq 5422 0.91 4842/5319 0.43 44/ 103
cusmoke_imp : Yes 5423 0.26 1370/5319 0.24 25/ 104
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.

Stratification by sex of the previous table, the results are similar to the complete analysis.

s <-
  Hmisc::summaryM(
     age_int+ age_int_cat+ weight+ height_imp+ education_imp+  pa_vig_freq+  pa_low_freq+ cusmoke_imp ~ outcome_allNA + gender,
    data = sharew1.baseline,
    overall = FALSE,
    test = FALSE
  )

Hmisc::html(
  s,
  caption = 'Baseline characteristics by all missing outcome by sex (0: all NA, 1: not all NA outcomes)',
  exclude1 = TRUE,
  npct = 'both',
  digits = 3,
  prmsd = TRUE,
  brmsd = TRUE,
  msdsize = mu$smaller2
)
Baseline characteristics by all missing outcome by sex (0: all NA, 1: not all NA outcomes).
N
0
N=2795
1
N=74
Male
age_int 2583 53.00 60.00 69.00
62.11 ±  9.79
57.50 69.00 81.00
70.12 ± 13.76
age_int_cat : 50-59 2583 0.48 1217/2540 0.30 13/ 43
  60-69 0.29 725/2540 0.21 9/ 43
  70-80 0.18 464/2540 0.19 8/ 43
  80+ 0.05 134/2540 0.30 13/ 43
weight 2565 75.0 82.0 92.0
84.0 ± 13.6
72.8 81.5 90.5
82.7 ± 14.8
height_imp 2570 173.00 178.00 183.00
178.12 ±  7.03
167.75 174.50 184.25
176.22 ±  9.29
education_imp : Low 2569 0.15 385/2531 0.32 12/ 38
  Medium 0.47 1187/2531 0.45 17/ 38
  High 0.38 959/2531 0.24 9/ 38
pa_vig_freq 2566 0.64 1625/2531 0.20 7/ 35
pa_low_freq 2566 0.91 2315/2531 0.46 16/ 35
cusmoke_imp : Yes 2567 0.27 686/2531 0.28 10/ 36
Female
age_int 2869 53.0 60.0 70.0
62.5 ± 10.4
62.2 77.0 86.0
74.1 ± 14.2
age_int_cat : 50-59 2869 0.48 1329/2795 0.23 17/ 74
  60-69 0.27 757/2795 0.15 11/ 74
  70-80 0.18 517/2795 0.31 23/ 74
  80+ 0.07 192/2795 0.31 23/ 74
weight 2796 60.0 68.0 77.0
69.5 ± 13.2
56.0 63.0 72.0
65.1 ± 12.0
height_imp 2848 161.00 165.00 170.00
165.42 ±  6.09
162.00 165.00 170.00
164.69 ±  6.94
education_imp : Low 2859 0.27 756/2789 0.54 38/ 70
  Medium 0.32 905/2789 0.30 21/ 70
  High 0.40 1128/2789 0.16 11/ 70
pa_vig_freq 2857 0.58 1628/2789 0.21 14/ 68
pa_low_freq 2856 0.91 2527/2788 0.41 28/ 68
cusmoke_imp : Yes 2856 0.25 684/2788 0.22 15/ 68
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.
5.2.3.6.3 Reason for missing values in the outcome

Metadata indicate that some variables in the GS module provide information about the reason for missing data in maxgrip in Wave 1. However, the information is not complete (the variable is not recorded in Wave 7 and has a large proportion of missing values in Wave 3).

In the data from Denmark missing data in the outcome are due to being unable to take the measurement for 36% of the missing values, indicating that missing values might be related to bad physical conditions; 21% refuse to take the measurement, the reason for missingness is not known in 38% of the cases.

my.tab1 <-
  table(share$gs001_,
        Missing = is.na(share$maxgrip),
        exclude = NULL)
my.tab2 <- round(prop.table(my.tab1, 2) * 100)

my.tab <- cbind(my.tab1[, 1], my.tab2[, 1], my.tab1[, 2], my.tab2[, 2])


dimnames(my.tab)[[2]] <- c("n", "%", "n", "%")

kable(my.tab, caption = "Number and percentage of participants with all missing outcomes", digits = 1) %>%  add_header_above(c(
  " " = 1,
  "Non missing outcome" = 2,
  "Missing outcome" = 2
)) %>% kable_styling()
Number and percentage of participants with all missing outcomes
Non missing outcome
Missing outcome
n % n %
R agrees to take measurement 15052 84 15 2
R refuses to take measurement 1 0 134 19
R is unable to take measurement 0 0 263 37
Proxy-interview 0 0 28 4
2876 16 263 37

For about 10% of missing outcome values the participants were unable to use one or both hands (gs002_ variable).

For most participants with missing outcome all the individual measurements (4 measurements, 2 per hand) are missing.

5.2.4 Patterns (M5)

Here we show the co-occurrence of item missingness across variables (we set the minimum set size to be displayed to 5, smaller sets can).

5.2.4.1 Co-occurrence of item missingness at baseline

p <- sharew1.baseline %>%
  select(weight,
         height_imp,
         education_imp ,
         pa_vig_freq ,
         pa_low_freq,
         cusmoke_imp,
         maxgrip)  %>%
  gg_miss_upset(nsets = 10, nintersects = 7)

p

png(
  "Figures/itemNA2.png",
  height = 7,
  width = 7,
  units = "in",
  res = 600,
  pointsize = 12
)
p
graphics.off()

There is no common pattern of missingness between variables at baseline. Most missing values appear in only one variable. Grip strength (maxgrip) is the variable with most missing values, followed by weight.

5.2.4.2 Co-occurrence of outcome missingness across measurement occasions

There was no clear association between missingness in different measuring occasions - a relatively small proportion of subjects had co-occurrence of outcome missingness in more than one occasion.

Here we use an UpSets plot, which show the number of participants that have certain variables missing together with other variables (missingness is indicated with a dot on the horizontal axes).

# number of intersections set so that the number is 5 or more, manual
p <-
  gg_miss_upset(mg.occasion.NA, nsets = num.waves, nintersects = 15)


png(
  "Figures/MG_na.png",
  height = 7,
  width = 7,
  units = "in",
  res = 600,
  pointsize = 12
)
p
graphics.off()


### figure 8 for paper
tiff(
  "figuresSubmission/fig8.tiff",
  compression = "lzw",
  height = 7,
  width = 7,
  units = "in",
  res = 600,
  pointsize = 12
)
gg_miss_upset(mg.occasion.NA, nsets = num.waves, nintersects = 15)
graphics.off()
#######

5.2.4.3 Co-occurrence of item-missingness across measurement occasions for time-varying covariates

There is no clear pattern of co-occurrence of missing values of the time varying covariates across measurement occasions. Here we did not consider as missing the variables missing by design (weight in wave 3, PA in SHARELIFE interviews). The graphs are omitted from this report.

5.2.5 Comparison of non-enrolled and target population (ME1)

Here the aim is to understand if the non-enrolled (participants that fulfill the inclusion criteria that do not participate in the study) differ from responders and how they compare to the target population.

The characteristics of non-enrolled could be studied only indirectly, comparing the samples of responders with some known characteristics of the target population (sex, age and education composition, EUROSTAT data that available from year 2007, Wave 2 of the study), as the data on non-enrolled are not provided by the SHARE study (ME1 domain).

The age, sex and education distributions of the responders were compared to those from the target population (EUROSTAT data, available from 2007, accessed in August 2022) for each of the waves. For Wave 2 and 5 we also analyzed the random refreshment samples (excluding the oversampled younger cohort, the two subsamples can be identified using the study meta-data); the comparison with the characteristics of the target population is the most straightforward analysis for studying the characteristics of the reponders, while the analysis of the full samples of responders from Waves 2 to 7 to their target population provide a mean for assessing the characteristics of non-reponders and participants lost to follow-up. For Wave 3 and 7 the target population was considered the 52+ population.

The results of all these analyses indicated that the responders that participated to the survey at least once had substantially higher education compared to the population in the same age and sex groups, the males in the younger age groups were slightly underrepresented, as were the older women.

# breaks used for age for presentation purposes,
# here we group in 5 year classes and the 85+ category

my.breaks <- c(50, 55, 60, 65, 70, 75, 80, 85, 100.1)

# reading and organization of population data

# import of the population data that is available
#Data downloaded from
#https://ec.europa.eu/eurostat/databrowser/view/edat_lfse_04/default/table?lang=en

tmp <-
  read.delim(here::here("Data", "demo_pjanedu__custom_3111506_tabular.tsv"))

mat.descr <-
  matrix(unlist(strsplit(tmp[, 1], ",")), nrow = nrow(tmp), byrow = TRUE)


data.pop.DK <-
  cbind.data.frame(Sex = mat.descr[, 3],
                   Age = mat.descr[, 4],
                   Education = mat.descr[, 5],
                   tmp[, -1])


# all 0 with unknown age, filter
data.pop.DK <- filter(data.pop.DK, Age != "UNK")
data.pop.DK <- filter(data.pop.DK, Age != "TOTAL")
data.pop.DK <- filter(data.pop.DK, Education != "TOTAL")
data.pop.DK <- filter(data.pop.DK, Education != "NAP")
#data.pop.DK <- filter(data.pop.DK, Age!="Y_OPEN")

data.pop.DK$Age <- as.character(data.pop.DK$Age)
#100 + category
data.pop.DK$Age[data.pop.DK$Age == "Y_OPEN"] <- "Y100"

#transforming to a number
data.pop.DK$Age <-
  as.numeric(sapply(strsplit(data.pop.DK$Age, "Y"), function(x)
    x[2]))

data.pop.DK$ISCED.cat <-
  factor(data.pop.DK$Education,
         labels = c("Low", "Medium", "High", "Unknown"))

#collapsing the data, removing education
data.pop.DK.NoEdu <-
  data.pop.DK %>% group_by(Age, Sex) %>% summarize(
    X2007 = sum(X2007),
    X2009 = sum(X2009),
    X2011 = sum(X2011),
    X2013 = sum(X2013),
    X2015 = sum(X2015),
    X2017 = sum(X2017)
  )



# Age group categories
data.pop.DK.AgeCat <-
  data.pop.DK %>% #filter(Age<85) %>% filter(ISCED.cat!="Unknown") %>%
  mutate(AgeCat = cut(Age, breaks = my.breaks, right = FALSE)) %>%
  group_by(Sex, AgeCat, ISCED.cat) %>% summarize(
    X2007 = sum(X2007),
    X2009 = sum(X2009),
    X2011 = sum(X2011),
    X2013 = sum(X2013),
    X2015 = sum(X2015),
    X2017 = sum(X2017)
    
  )

d2 <-
  data.pop.DK %>% #filter(Age<85) %>%  filter(ISCED.cat!="Unknown") %>%
  group_by(AgeCat = cut(Age, breaks = my.breaks, right = FALSE)) %>%
  summarise(
    nWave2 = sum(X2007),
    nWave3 = sum(X2009),
    nWave4 = sum(X2011),
    nWave5 = sum(X2013),
    nWave6 = sum(X2015),
    nWave7 = sum(X2017)
    
  )


data.pop.DK.AgeCat.NoEdu <-
  data.pop.DK.AgeCat %>% group_by(AgeCat, Sex) %>% summarize(
    X2007 = sum(X2007),
    X2009 = sum(X2009),
    X2011 = sum(X2011),
    X2013 = sum(X2013),
    X2015 = sum(X2015),
    X2017 = sum(X2017)
  )

d3.pop.noEdu <-
  left_join(data.pop.DK.AgeCat.NoEdu, d2, by = c("AgeCat")) %>% mutate(
    propWave2 = X2007 / nWave2,
    propWave3 = X2009 /
      nWave3,
    propWave4 = X2011 /
      nWave4,
    propWave5 = X2013 /
      nWave5,
    propWave6 = X2015 /
      nWave6,
    propWave7 = X2017 /
      nWave7
  )



#d3.pop$ISCED.cat <- factor(d3.pop$ISCED.cat, levels=c("High", "Medium", "Low"))
# relabelling and reordering the names in sex
d3.pop.noEdu$Sex <-
  factor(d3.pop.noEdu$Sex, labels = c("Female", "Male"))
d3.pop.noEdu$Sex <-
  factor(d3.pop.noEdu$Sex, levels = c("Male", "Female"))


# populaton with education

d2 <-
  data.pop.DK %>% filter(Age < 85) %>%  filter(ISCED.cat != "Unknown") %>%
  group_by(Sex, AgeCat = cut(
    Age,
    breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
    right = FALSE
  )) %>%
  summarise(
    nWave2 = sum(X2007),
    nWave3 = sum(X2009),
    nWave4 = sum(X2011),
    nWave5 = sum(X2013),
    nWave6 = sum(X2015),
    nWave7 = sum(X2017)
    
  )


d3.pop <-
  left_join(
    data.pop.DK.AgeCat %>% filter(ISCED.cat != "Unknown", AgeCat != "[85,100)"),
    d2,
    by = c("Sex", "AgeCat")
  ) %>% mutate(
    propWave2 = X2007 / nWave2,
    propWave3 = X2009 /
      nWave3,
    propWave4 = X2011 /
      nWave4,
    propWave5 = X2013 /
      nWave5,
    propWave6 = X2015 /
      nWave6,
    propWave7 = X2017 /
      nWave7
  )



d3.pop$ISCED.cat <-
  factor(d3.pop$ISCED.cat, levels = c("High", "Medium", "Low"))
# relabelling and reordering the names in sex
d3.pop$Sex <- factor(d3.pop$Sex, labels = c("Female", "Male"))
d3.pop$Sex <- factor(d3.pop$Sex, levels = c("Male", "Female"))


#graph above 85, shows that the analysis cannot be done after 85
# data sets for the population data 52+

# Age group categories
data.pop.DK.AgeCat.52 <-
  data.pop.DK %>% filter(Age >= 52) %>% #filter(Age<85) %>% filter(ISCED.cat!="Unknown") %>%
  mutate(AgeCat = cut(Age, breaks = my.breaks, right = FALSE)) %>%
  group_by(Sex, AgeCat, ISCED.cat) %>% summarize(
    X2007 = sum(X2007),
    X2009 = sum(X2009),
    X2011 = sum(X2011),
    X2013 = sum(X2013),
    X2015 = sum(X2015),
    X2017 = sum(X2017)
    
  )

d2 <-
  data.pop.DK %>% filter(Age >= 52) %>% #filter(Age<85) %>%  filter(ISCED.cat!="Unknown") %>%
  group_by(AgeCat = cut(Age, breaks = my.breaks, right = FALSE)) %>%
  summarise(
    nWave2 = sum(X2007),
    nWave3 = sum(X2009),
    nWave4 = sum(X2011),
    nWave5 = sum(X2013),
    nWave6 = sum(X2015),
    nWave7 = sum(X2017)
    
  )


data.pop.DK.AgeCat.NoEdu.52 <-
  data.pop.DK.AgeCat.52 %>% group_by(AgeCat, Sex) %>% summarize(
    X2007 = sum(X2007),
    X2009 = sum(X2009),
    X2011 = sum(X2011),
    X2013 = sum(X2013),
    X2015 = sum(X2015),
    X2017 = sum(X2017)
  )

d3.pop.noEdu.52 <-
  left_join(data.pop.DK.AgeCat.NoEdu.52, d2, by = c("AgeCat")) %>% mutate(
    propWave2 = X2007 / nWave2,
    propWave3 = X2009 /
      nWave3,
    propWave4 = X2011 /
      nWave4,
    propWave5 = X2013 /
      nWave5,
    propWave6 = X2015 /
      nWave6,
    propWave7 = X2017 /
      nWave7
  )



#d3.pop$ISCED.cat <- factor(d3.pop$ISCED.cat, levels=c("High", "Medium", "Low"))
# relabelling and reordering the names in sex
d3.pop.noEdu.52$Sex <-
  factor(d3.pop.noEdu.52$Sex, labels = c("Female", "Male"))
d3.pop.noEdu.52$Sex <-
  factor(d3.pop.noEdu.52$Sex, levels = c("Male", "Female"))


# populaton with education

d2 <-
  data.pop.DK %>% filter(Age < 85 &
                           Age >= 52) %>%  filter(ISCED.cat != "Unknown") %>%
  group_by(Sex, AgeCat = cut(
    Age,
    breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
    right = FALSE
  )) %>%
  summarise(
    nWave2 = sum(X2007),
    nWave3 = sum(X2009),
    nWave4 = sum(X2011),
    nWave5 = sum(X2013),
    nWave6 = sum(X2015),
    nWave7 = sum(X2017)
    
  )


d3.pop.52 <-
  left_join(
    data.pop.DK.AgeCat.52 %>% filter(ISCED.cat != "Unknown", AgeCat != "[85,100)"),
    d2,
    by = c("Sex", "AgeCat")
  ) %>% mutate(
    propWave2 = X2007 / nWave2,
    propWave3 = X2009 /
      nWave3,
    propWave4 = X2011 /
      nWave4,
    propWave5 = X2013 /
      nWave5,
    propWave6 = X2015 /
      nWave6,
    propWave7 = X2017 /
      nWave7
  )



d3.pop.52$ISCED.cat <-
  factor(d3.pop.52$ISCED.cat, levels = c("High", "Medium", "Low"))
# relabelling and reordering the names in sex
d3.pop.52$Sex <- factor(d3.pop.52$Sex, labels = c("Female", "Male"))
d3.pop.52$Sex <- factor(d3.pop.52$Sex, levels = c("Male", "Female"))

5.2.5.1 Non-response, using data from Wave 2 and Wave 5

Only Wave 2 and 5 provide full age samples that can be used to study the characteristics of non responders. For presentation purposes the age groups 85+ were grouped because of the small number of participants older than this age. Population data about education are mostly missing for individuals older than 85 in 2007, therefore the analyses about education are restricted to this age group.

5.2.5.1.1 Wave 2

Here we restrict the attention to the random refreshment sample from Wave 2 that responded to the interview and compare it to the target population in terms of age, sex and education.

# Wave 2, random sample labelled as DK-S3
my.breaks <- c(50, 55, 60, 65, 70, 75, 80, 85, 100.1)


d3.sample <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(subsample == "DK-S3" &
                                                 Wave == "Wave 2") %>% #filter(age_int <85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE), gender, Wave) %>% summarize(num =
                                                                                                n())

num.tot <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(subsample == "DK-S3" &
                                                 Wave == "Wave 2") %>% #filter(age_int <85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE),  Wave) %>% summarize(numTot =
                                                                                         n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age",  "Wave")) %>% mutate(propWave = num /
                                                                     numTot)

The analysis included 1084 participants from the random sample Wave 2.

Sex

g2s <-
  ggplot(d3.sample, aes(
    x = Age,
    y = propWave,
    label =  round(propWave, 2),
    fill = gender
  )) +  geom_bar(aes(fill = gender, group = gender), stat = "identity") +  labs(y =
                                                                                  "Proportion", title = "Responders (2007, r.s.)", fill = "Sex") + geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(. ~
                                                                                                                                                                                                                               .) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45))
  theme(legend.position = "bottom")  + theme_classic() + geom_text(
    data = d3.sample,
    aes(label = paste("n=", numTot), y = Inf),
    vjust = 1,
    size = 3
  )

g2s

g2p <-
  ggplot(d3.pop.noEdu,
         aes(
           x = AgeCat,
           y = propWave2,
           label =  round(propWave2, 2),
           fill = Sex
         )) + facet_grid(. ~ .) + geom_bar(aes(fill = Sex, group = Sex), stat = "identity") +  labs(
           title = "Population (2007)",
           fill = "Sex",
           group = "Sex",
           y = "Proportion",
           x = "Age group"
         ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme(legend.position = "bottom") + theme_classic()


g2p

# alternative graphical display

d3.sample <-
  d3.sample %>% mutate(
    upperE = propWave + 1.96 * sqrt((num / numTot) * (1 - (num / numTot)) /
                                      numTot),
    lowerE = propWave - 1.96 * sqrt((num /
                                       numTot) * (1 - (num / numTot)) / numTot)
  )




g2s.points <-
  ggplot(
    left_join(
      d3.pop.noEdu,
      mutate(d3.sample, AgeCat = Age, Sex = gender),
      by = c("AgeCat", "Sex")
    ) %>%  filter(Sex == "Female"),
    aes(AgeCat, propWave2)
  ) + geom_point(col = "red", size = 3) + geom_point(aes(y = propWave), col =
                                                       1) + theme_bw() + geom_errorbar(aes(ymin = lowerE, ymax = upperE)) +
  labs(x = "Age group", y = "Proportion of females in age group", title = "Denmark 2007 / Wave 2 - population (red) and \n responders (r.s., black with 95% CI)")


g2s.points

ggsave(
  "Figures/sexW2rs.png",
  plot = g2s.points,
  scale = 1.2,
  width = 5,
  height = 5
)

The last figure shows the proportion of females in the population (red dots) and in the sample (black dots, with 95% confidence intervals).

The distribution of sex in the sample and in the population is similar, a deviation can be observed in the younger age group, where females in the sample are overrepresented

Age

Females in the SHARE are underrepresented in the age group 80 and above compared to the population of females in Denmark as shown in the scatterplot.

tmp.data <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(subsample == "DK-S3" &
                                                 Wave == "Wave 2")
par(mfrow = c(1, 2))

qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2007[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]),
  tmp.data$age_int[tmp.data$gender == "Female"],
  xlab = "Age in the population (F)",
  ylab = "Age in the sample (F)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)


qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2007[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]),
  tmp.data$age_int[tmp.data$gender == "Male"],
  xlab = "Age in the population (M)",
  ylab = "Age in the sample (M)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)

my.mat <-
  rbind.data.frame(
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2007[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Female"]),
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2007[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Male"])
  )

dimnames(my.mat)[[1]] <-
  c("Population females",
    "Sample W2 females",
    "Population males",
    "Sample W2 males")
dimnames(my.mat)[[2]] <-
  c("Min.", "1st Qu." ,  "Median", "Mean", "3rd Qu.", "Max.")


kable(my.mat, digits = 1, caption = "Distribution of age, population 2007 vs random sample in Wave 2") %>% kable_styling()
Distribution of age, population 2007 vs random sample in Wave 2
Min. 1st Qu. Median Mean 3rd Qu. Max.
Population females 50 57 63 65.7 74 100
Sample W2 females 50 56 63 64.7 72 98
Population males 50 56 62 63.7 70 100
Sample W2 males 50 57 63 64.4 70 92

Education

The lower educated individuals are underrepresented in the sample. The differences between the sample and the population seem present in all age and sex groups.

d3.sample <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(subsample == "DK-S3" &
                                                 Wave == "Wave 2") %>% filter(age_int < 85) %>%  filter(!is.na(education_imp)) %>% group_by(Age =
                                                                                                                                              cut(
                                                                                                                                                age_int,
                                                                                                                                                breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
                                                                                                                                                right = FALSE
                                                                                                                                              ),
                                                                                                                                            gender,
                                                                                                                                            education_imp,
                                                                                                                                            Wave) %>% summarize(num = n())


num.tot <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(subsample == "DK-S3" &
                                                 Wave == "Wave 2") %>% filter(age_int < 85) %>%  filter(!is.na(education_imp)) %>% group_by(Age =
                                                                                                                                              cut(
                                                                                                                                                age_int,
                                                                                                                                                breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
                                                                                                                                                right = FALSE
                                                                                                                                              ), gender, Wave) %>% summarize(numTot = n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age", "gender", "Wave")) %>% mutate(propWave = num /
                                                                              numTot)

d3.sample$education_imp <-
  factor(d3.sample$education_imp, levels = c("High", "Medium", "Low"))

# graph of the distribution of the education in the random sample from wave 2, stratified by sex

g2s <-
  ggplot(d3.sample,
         aes(
           x = Age,
           y = propWave,
           label =  round(propWave, 2),
           fill = education_imp
         )) +  geom_bar(aes(fill = education_imp, group = education_imp), stat =
                          "identity") +  #labs(y="Proportion", title="Distribution of education in the random sample from Wave 2 (2007)", fill="Education", group="Education", x="Age group")
  labs(
    y = "Proportion",
    title = paste0(
      "Responders (2007)\n (Wave2, random sample, n=",
      sum(d3.sample$num),
      ")"
    ),
    fill = "Education",
    group = "Education",
    x = "Age group"
  ) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(gender ~
                                                                              .) + #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() + theme(legend.position = "none")

# graph of the distribution of the education in the population, stratified by sex

g2p <-
  ggplot(d3.pop,
         aes(
           x = AgeCat,
           y = propWave2,
           label =  round(propWave2, 2),
           fill = ISCED.cat
         )) + facet_grid(Sex ~ .) + geom_bar(aes(fill = ISCED.cat, group = ISCED.cat), stat =
                                               "identity") +
  labs(
    title = "Population (2007)",
    fill = "Education",
    group = "Education",
    y = "Proportion",
    x = "Age group"
  ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() + theme(legend.position = "none")

# displaying graphs
g2s

g2p

# saving the graphs

g2p <- g2p + theme(legend.position = "bottom")
g2s <- g2s + theme(legend.position = "none") + labs(y = "")

g <- g2p + g2s
#g # does not fit in the output, displaying separately
ggsave(
  "Figures/educationW2rs.png",
  plot = g,
  scale = 1.2,
  width = 10,
  height = 5
)


### figure 4 for paper

ggsave(
  "FiguresSubmission/Fig4.tif",
  plot = g,
  scale = 1.2,
  width = 10,
  height = 5,
  device = "tiff",
  dpi = 600,
  compression = "lzw"
)

#####
5.2.5.1.2 Wave 5

Here we restrict the attention to the random refreshment sample from Wave 5 that responded to the interview and compare it to the target population in terms of age, sex and education.

# Wave 5, random sample labelled as DK-S6

my.breaks <- c(50, 55, 60, 65, 70, 75, 80, 85, 100.1)


d3.sample <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(subsample == "DK-S6" &
                                                 Wave == "Wave 5") %>% #filter(age_int <85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE), gender, Wave) %>% summarize(num =
                                                                                                n())

num.tot <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(subsample == "DK-S6" &
                                                 Wave == "Wave 5") %>% #filter(age_int <85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE),  Wave) %>% summarize(numTot =
                                                                                         n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age",  "Wave")) %>% mutate(propWave = num /
                                                                     numTot)

The analysis included 1629 participants from the random sample Wave 5.

Sex

g2s <-
  ggplot(d3.sample, aes(
    x = Age,
    y = propWave,
    label =  round(propWave, 2),
    fill = gender
  )) +  geom_bar(aes(fill = gender, group = gender), stat = "identity") +  labs(y =
                                                                                  "Proportion", title = "Responders (2013, r.s.)", fill = "Sex") + geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(. ~
                                                                                                                                                                                                                               .) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45))
  theme(legend.position = "bottom")  + theme_classic() + geom_text(
    data = d3.sample,
    aes(label = paste("n=", numTot), y = Inf),
    vjust = 1,
    size = 3
  )

g2p <-
  ggplot(d3.pop.noEdu,
         aes(
           x = AgeCat,
           y = propWave5,
           label =  round(propWave5, 2),
           fill = Sex
         )) + facet_grid(. ~ .) + geom_bar(aes(fill = Sex, group = Sex), stat = "identity") +  labs(
           title = "Population (2013)",
           fill = "Sex",
           group = "Sex",
           y = "Proportion",
           x = "Age group"
         ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme(legend.position = "bottom") + theme_classic()

g2s

g2p

# alternative graphical display

d3.sample <-
  d3.sample %>% mutate(
    upperE = propWave + 1.96 * sqrt((num / numTot) * (1 - (num / numTot)) /
                                      numTot),
    lowerE = propWave - 1.96 * sqrt((num /
                                       numTot) * (1 - (num / numTot)) / numTot)
  )


g2s.points <-
  ggplot(
    left_join(
      d3.pop.noEdu,
      mutate(d3.sample, AgeCat = Age, Sex = gender),
      by = c("AgeCat", "Sex")
    ) %>%  filter(Sex == "Female"),
    aes(AgeCat, propWave5)
  ) + geom_point(col = "red", size = 3) + geom_point(aes(y = propWave), col =
                                                       1) + theme_bw() + geom_errorbar(aes(ymin = lowerE, ymax = upperE)) +
  labs(x = "Age group", y = "Proportion of females in age group", title = "Denmark 2013 / Wave 5 - population (red) and \n responders (r.s., black with 95% CI)")


g2s.points

ggsave(
  "Figures/sexW5rs.png",
  plot = g2s.points,
  scale = 1.2,
  width = 5,
  height = 5
)

The distribution of sex differs between sample and population differs more than in wave 2, males in the younger age groups are more underrepresented.

Age

The older females are somehow underrepresented in the sample compared to the population, as are the younger men.

tmp.data <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(subsample == "DK-S6" &
                                                 Wave == "Wave 5")
par(mfrow = c(1, 2))

qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]),
  tmp.data$age_int[tmp.data$gender == "Female"],
  xlab = "Age in the population (F)",
  ylab = "Age in the sample (F)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)


qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]),
  tmp.data$age_int[tmp.data$gender == "Male"],
  xlab = "Age in the population (M)",
  ylab = "Age in the sample (M)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)

my.mat <-
  rbind.data.frame(
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Female"]),
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Male"])
  )

dimnames(my.mat)[[1]] <-
  c("Population females",
    "Sample W5 females",
    "Population males",
    "Sample W5 males")
dimnames(my.mat)[[2]] <-
  c("Min.", "1st Qu." ,  "Median", "Mean", "3rd Qu.", "Max.")


kable(my.mat, digits = 1, caption = "Distribution of age, population 2013 vs random sample in Wave 5") %>% kable_styling()
Distribution of age, population 2013 vs random sample in Wave 5
Min. 1st Qu. Median Mean 3rd Qu. Max.
Population females 50 57 65 66.0 73 100
Sample W5 females 50 58 64 65.4 72 100
Population males 50 56 63 64.3 71 100
Sample W5 males 50 58 66 66.1 72 98

Education

The lower educated individuals are underrepresented in the sample. The differences between the sample and the population seem present in all age and sex groups - only older females have a similar distribution of education in sample and population.

d3.sample <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(subsample == "DK-S6" &
                                                 Wave == "Wave 5") %>% filter(age_int < 85) %>%  filter(!is.na(education_imp)) %>% group_by(Age =
                                                                                                                                              cut(
                                                                                                                                                age_int,
                                                                                                                                                breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
                                                                                                                                                right = FALSE
                                                                                                                                              ),
                                                                                                                                            gender,
                                                                                                                                            education_imp,
                                                                                                                                            Wave) %>% summarize(num = n())


num.tot <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(subsample == "DK-S6" &
                                                 Wave == "Wave 5") %>% filter(age_int < 85) %>%  filter(!is.na(education_imp)) %>% group_by(Age =
                                                                                                                                              cut(
                                                                                                                                                age_int,
                                                                                                                                                breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
                                                                                                                                                right = FALSE
                                                                                                                                              ), gender, Wave) %>% summarize(numTot = n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age", "gender", "Wave")) %>% mutate(propWave = num /
                                                                              numTot)

d3.sample$education_imp <-
  factor(d3.sample$education_imp, levels = c("High", "Medium", "Low"))


# graph of the distribution of the education in the random sample from Wave 5, stratified by sex

g2s <-
  ggplot(d3.sample,
         aes(
           x = Age,
           y = propWave,
           label =  round(propWave, 2),
           fill = education_imp
         )) +  geom_bar(aes(fill = education_imp, group = education_imp), stat =
                          "identity") +  #labs(y="Proportion", title="Distribution of education in the random sample from Wave 5 (2007)", fill="Education", group="Education", x="Age group")
  labs(
    y = "Proportion",
    title = "Responders (2013, r.s.)",
    fill = "Education",
    group = "Education",
    x = "Age group"
  ) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(gender ~
                                                                              .) + #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() + theme(legend.position = "none")

# graph of the distribution of the education in the population, stratified by sex

g2p <-
  ggplot(d3.pop,
         aes(
           x = AgeCat,
           y = propWave5,
           label =  round(propWave5, 2),
           fill = ISCED.cat
         )) + facet_grid(Sex ~ .) + geom_bar(aes(fill = ISCED.cat, group = ISCED.cat), stat =
                                               "identity") +
  labs(
    title = "Population (2013)",
    fill = "Education",
    group = "Education",
    y = "Proportion",
    x = "Age group"
  ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() +  theme(legend.position = "none")

# displaying graphs
g2s

g2p

# saving the graphs

g2p <- g2p + theme(legend.position = "bottom")
g2s <- g2s + theme(legend.position = "none") + labs(y = "")

g <- g2p + g2s
#g # does not fit in the output, displaying separately
ggsave(
  "Figures/educationW5rs.png",
  plot = g,
  scale = 1.2,
  width = 10,
  height = 5
)

5.2.5.2 Non-response and loss to follow-up

Here we compare the observed samples (all but Wave 1) with the population values.

Wave 2, all participants

All respondents from wave 2 vs population

# Wave 2, all participants
my.breaks <- c(50, 55, 60, 65, 70, 75, 80, 85, 100.1)


d3.sample <- filter(share1, Wave == "Wave 2") %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE), gender, Wave) %>% summarize(num =
                                                                                                n())

num.tot <-
  share1 %>%  filter(Wave == "Wave 2") %>% #filter(age_int <85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE),  Wave) %>% summarize(numTot =
                                                                                         n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age",  "Wave")) %>% mutate(propWave = num /
                                                                     numTot)

The analysis included 2487 participants from the random sample Wave 2.

Sex

g2s <-
  ggplot(d3.sample, aes(
    x = Age,
    y = propWave,
    label =  round(propWave, 2),
    fill = gender
  )) +  geom_bar(aes(fill = gender, group = gender), stat = "identity") +  labs(y =
                                                                                  "Proportion", title = "Responders (2007)", fill = "Sex") + geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(. ~
                                                                                                                                                                                                                         .) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45))
  theme(legend.position = "bottom")  + theme_classic() + geom_text(
    data = d3.sample,
    aes(label = paste("n=", numTot), y = Inf),
    vjust = 1,
    size = 3
  )

g2p <-
  ggplot(d3.pop.noEdu,
         aes(
           x = AgeCat,
           y = propWave2,
           label =  round(propWave2, 2),
           fill = Sex
         )) + facet_grid(. ~ .) + geom_bar(aes(fill = Sex, group = Sex), stat = "identity") +  labs(
           title = "Population (2007)",
           fill = "Sex",
           group = "Sex",
           y = "Proportion",
           x = "Age group"
         ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme(legend.position = "bottom") + theme_classic()

g2s

g2p

# alternative graphical display

d3.sample <-
  d3.sample %>% mutate(
    upperE = propWave + 1.96 * sqrt((num / numTot) * (1 - (num / numTot)) /
                                      numTot),
    lowerE = propWave - 1.96 * sqrt((num /
                                       numTot) * (1 - (num / numTot)) / numTot)
  )




g2s.points <-
  ggplot(
    left_join(
      d3.pop.noEdu,
      mutate(d3.sample, AgeCat = Age, Sex = gender),
      by = c("AgeCat", "Sex")
    ) %>%  filter(Sex == "Female"),
    aes(AgeCat, propWave2)
  ) + geom_point(col = "red", size = 3) + geom_point(aes(y = propWave), col =
                                                       1) + theme_bw() + geom_errorbar(aes(ymin = lowerE, ymax = upperE)) +
  labs(x = "Age group", y = "Proportion of females in age group", title = "Denmark 2007 / Wave 2 - population (red) and \n responders (black with 95% CI)")


g2s.points

ggsave(
  "Figures/sexW2All.png",
  plot = g2s.points,
  scale = 1.2,
  width = 5,
  height = 5
)

The distribution of sex differs between sample and population, males in the younger age groups are more underrepresented.

Age

The older females are somehow underrepresented in the sample compared to the population, as are the younger men.

tmp.data <- share1 %>% filter(Wave == "Wave 2")
par(mfrow = c(1, 2))

qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]),
  tmp.data$age_int[tmp.data$gender == "Female"],
  xlab = "Age in the population (F)",
  ylab = "Age in the sample (F)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)


qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]),
  tmp.data$age_int[tmp.data$gender == "Male"],
  xlab = "Age in the population (M)",
  ylab = "Age in the sample (M)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)

my.mat <-
  rbind.data.frame(
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Female"]),
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Male"])
  )

dimnames(my.mat)[[1]] <-
  c("Population females",
    "Sample W2 females",
    "Population males",
    "Sample W2 males")
dimnames(my.mat)[[2]] <-
  c("Min.", "1st Qu." ,  "Median", "Mean", "3rd Qu.", "Max.")


kable(my.mat, digits = 1, caption = "Distribution of age, population 2007 vs random sample in Wave 2") %>% kable_styling()
Distribution of age, population 2007 vs random sample in Wave 2
Min. 1st Qu. Median Mean 3rd Qu. Max.
Population females 50 57 65 66.0 73 100
Sample W2 females 50 56 63 65.1 73 99
Population males 50 56 63 64.3 71 100
Sample W2 males 50 56 62 63.9 70 92

Education

The lower educated individuals are underrepresented in the sample. The differences between the sample and the population seem present in all age and sex groups - only older females have a similar distribution of education in sample and population.

Check here the output

d3.sample <-
  share1 %>% filter(Wave == "Wave 2") %>% filter(age_int < 85) %>%  filter(!is.na(education_imp)) %>% group_by(Age =
                                                                                                                 cut(
                                                                                                                   age_int,
                                                                                                                   breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
                                                                                                                   right = FALSE
                                                                                                                 ),
                                                                                                               gender,
                                                                                                               education_imp,
                                                                                                               Wave) %>% summarize(num = n())


num.tot <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(Wave == "Wave 2") %>% filter(age_int <
                                                                              85) %>%  filter(!is.na(education_imp)) %>% group_by(Age = cut(
                                                                                age_int,
                                                                                breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
                                                                                right = FALSE
                                                                              ), gender, Wave) %>% summarize(numTot = n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age", "gender", "Wave")) %>% mutate(propWave = num /
                                                                              numTot)

d3.sample$education_imp <-
  factor(d3.sample$education_imp, levels = c("High", "Medium", "Low"))

# graph of the distribution of the education in the random sample from Wave 5, stratified by sex

g2s <-
  ggplot(d3.sample,
         aes(
           x = Age,
           y = propWave,
           label =  round(propWave, 2),
           fill = education_imp
         )) +  geom_bar(aes(fill = education_imp, group = education_imp), stat =
                          "identity") +  #labs(y="Proportion", title="Distribution of education in the random sample from Wave 5 (2007)", fill="Education", group="Education", x="Age group")
  labs(
    y = "Proportion",
    title = "Responders (2007)",
    fill = "Education",
    group = "Education",
    x = "Age group"
  ) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(gender ~
                                                                              .) + #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  #theme(legend.position ="bottom")  +
  theme_classic() + theme(legend.position = "none")


# graph of the distribution of the education in the population, stratified by sex

g2p <-
  ggplot(d3.pop,
         aes(
           x = AgeCat,
           y = propWave2,
           label =  round(propWave2, 2),
           fill = ISCED.cat
         )) + facet_grid(Sex ~ .) + geom_bar(aes(fill = ISCED.cat, group = ISCED.cat), stat =
                                               "identity") +
  labs(
    title = "Population (2007)",
    fill = "Education",
    group = "Education",
    y = "Proportion",
    x = "Age group"
  ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() + theme(legend.position = "bottom")

# displaying graphs
g2s

g2p

# saving the graphs

g2p <- g2p + theme(legend.position = "bottom")
g2s <- g2s + theme(legend.position = "none") + labs(y = "")

g <- g2p + g2s
#g # does not fit in the output, displaying separately
ggsave(
  "Figures/educationW2All.png",
  plot = g,
  scale = 1.2,
  width = 10,
  height = 5
)

Wave 3, all participants

All respondents from wave 3 vs population of 52+ (no refreshment samples in wave 3). (The labels of the younger age group indicate 50-55 but refer to 52-55).

# Wave 3, all participants
my.breaks <- c(50, 55, 60, 65, 70, 75, 80, 85, 100.1)


d3.sample <- filter(share1, Wave == "Wave 3") %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE), gender, Wave) %>% summarize(num =
                                                                                                n())

num.tot <-
  share1 %>%  filter(Wave == "Wave 3") %>% #filter(age_int <85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE),  Wave) %>% summarize(numTot =
                                                                                         n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age",  "Wave")) %>% mutate(propWave = num /
                                                                     numTot)

The analysis included 1979 participants from the random sample Wave 3.

Sex

g2s <-
  ggplot(d3.sample, aes(
    x = Age,
    y = propWave,
    label =  round(propWave, 2),
    fill = gender
  )) +  geom_bar(aes(fill = gender, group = gender), stat = "identity") +  labs(y =
                                                                                  "Proportion", title = "Responders (2009)", fill = "Sex") + geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(. ~
                                                                                                                                                                                                                         .) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45))
  theme(legend.position = "bottom")  + theme_classic() + geom_text(
    data = d3.sample,
    aes(label = paste("n=", numTot), y = Inf),
    vjust = 1,
    size = 3
  )

g2p <-
  ggplot(d3.pop.noEdu.52,
         aes(
           x = AgeCat,
           y = propWave3,
           label =  round(propWave3, 2),
           fill = Sex
         )) + facet_grid(. ~ .) + geom_bar(aes(fill = Sex, group = Sex), stat = "identity") +  labs(
           title = "Population (2009)",
           fill = "Sex",
           group = "Sex",
           y = "Proportion",
           x = "Age group"
         ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme(legend.position = "bottom") + theme_classic()

g2s

g2p

# alternative graphical display

d3.sample <-
  d3.sample %>% mutate(
    upperE = propWave + 1.96 * sqrt((num / numTot) * (1 - (num / numTot)) /
                                      numTot),
    lowerE = propWave - 1.96 * sqrt((num /
                                       numTot) * (1 - (num / numTot)) / numTot)
  )

g2s.points <-
  ggplot(
    left_join(
      d3.pop.noEdu.52,
      mutate(d3.sample, AgeCat = Age, Sex = gender),
      by = c("AgeCat", "Sex")
    ) %>%  filter(Sex == "Female"),
    aes(AgeCat, propWave3)
  ) + geom_point(col = "red", size = 3) + geom_point(aes(y = propWave), col =
                                                       1) + theme_bw() + geom_errorbar(aes(ymin = lowerE, ymax = upperE)) +
  labs(x = "Age group", y = "Proportion of females in age group", title = "Denmark 2009 / Wave 3 - population (red) and \n responders (black with 95% CI)")


g2s.points

ggsave(
  "Figures/sexW3All.png",
  plot = g2s.points,
  scale = 1.2,
  width = 5,
  height = 5
)

As in the other waves, males in the younger age groups are underrepresented.

Age

The older females are somehow underrepresented in the sample compared to the population, as are the younger men.

tmp.data <- share1 %>% filter(Wave == "Wave 3")
par(mfrow = c(1, 2))

qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F" &
                              data.pop.DK.NoEdu$Age >= 52], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                      "F" &
                                                                                      data.pop.DK.NoEdu$Age >= 52]),
  tmp.data$age_int[tmp.data$gender == "Female"],
  xlab = "Age in the population (F)",
  ylab = "Age in the sample (F)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)


qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M" &
                              data.pop.DK.NoEdu$Age >= 52], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                      "M" &
                                                                                      data.pop.DK.NoEdu$Age >= 52]),
  tmp.data$age_int[tmp.data$gender == "Male"],
  xlab = "Age in the population (M)",
  ylab = "Age in the sample (M)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)

my.mat <-
  rbind.data.frame(
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F" &
                              data.pop.DK.NoEdu$Age >= 52], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                      "F" & data.pop.DK.NoEdu$Age >= 52]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Female"]),
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Male"])
  )

dimnames(my.mat)[[1]] <-
  c("Population females",
    "Sample W3 females",
    "Population males",
    "Sample W3 males")
dimnames(my.mat)[[2]] <-
  c("Min.", "1st Qu." ,  "Median", "Mean", "3rd Qu.", "Max.")


kable(my.mat, digits = 1, caption = "Distribution of age, population 2009 vs random sample in Wave 3") %>% kable_styling()
Distribution of age, population 2009 vs random sample in Wave 3
Min. 1st Qu. Median Mean 3rd Qu. Max.
Population females 52 59 66 67.1 74 100
Sample W3 females 51 58 64 66.3 74 97
Population males 50 56 63 64.3 71 100
Sample W3 males 51 58 64 65.2 71 94

Education

The lower educated individuals are underrepresented in the sample. The differences between the sample and the population seem present in all age and sex groups - only older females have a similar distribution of education in sample and population.

d3.sample <-
  share1 %>% filter(Wave == "Wave 3") %>% filter(age_int < 85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age =
             cut(
               age_int,
               breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
               right = FALSE
             ),
           gender,
           education_imp,
           Wave) %>% summarize(num = n())


num.tot <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(Wave == "Wave 3") %>% filter(age_int <
                                                                              85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(
    age_int,
    breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
    right = FALSE
  ), gender, Wave) %>% summarize(numTot = n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age", "gender", "Wave")) %>% mutate(propWave = num /
                                                                              numTot)

d3.sample$education_imp <-
  factor(d3.sample$education_imp, levels = c("High", "Medium", "Low"))

# graph of the distribution of the education in the random sample from Wave 5, stratified by sex

g2s <-
  ggplot(d3.sample,
         aes(
           x = Age,
           y = propWave,
           label =  round(propWave, 2),
           fill = education_imp
         )) +  geom_bar(aes(fill = education_imp, group = education_imp), stat =
                          "identity") +  #labs(y="Proportion", title="Distribution of education in the random sample from Wave 5 (2007)", fill="Education", group="Education", x="Age group")
  labs(
    y = "Proportion",
    title = "Responders (2009)",
    fill = "Education",
    group = "Education",
    x = "Age group"
  ) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(gender ~
                                                                              .) + #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  #theme(legend.position ="bottom")
  theme_classic()

# graph of the distribution of the education in the population, stratified by sex

g2p <-
  ggplot(d3.pop.52,
         aes(
           x = AgeCat,
           y = propWave3,
           label =  round(propWave3, 2),
           fill = ISCED.cat
         )) + facet_grid(Sex ~ .) + geom_bar(aes(fill = ISCED.cat, group = ISCED.cat), stat =
                                               "identity") +
  labs(
    title = "Population (2009)",
    fill = "Education",
    group = "Education",
    y = "Proportion",
    x = "Age group"
  ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  # theme(legend.position ="bottom") +
  theme_classic()

# displaying graphs
g2s

g2p

# saving the graphs

g2p <- g2p + theme(legend.position = "bottom")
g2s <- g2s + theme(legend.position = "none") + labs(y = "")

g <- g2p + g2s
#g # does not fit in the output, displaying separately
ggsave(
  "Figures/educationW3All.png",
  plot = g,
  scale = 1.2,
  width = 10,
  height = 5
)

Wave 4, all participants

All respondents from wave 4 vs population

# Wave 4, all participants
my.breaks <- c(50, 55, 60, 65, 70, 75, 80, 85, 100.1)


d3.sample <- filter(share1, Wave == "Wave 4") %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE), gender, Wave) %>% summarize(num =
                                                                                                n())

num.tot <-
  share1 %>%  filter(Wave == "Wave 4") %>% #filter(age_int <85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE),  Wave) %>% summarize(numTot =
                                                                                         n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age",  "Wave")) %>% mutate(propWave = num /
                                                                     numTot)

The analysis included 2112 participants from the random sample Wave 4.

Sex

g2s <-
  ggplot(d3.sample, aes(
    x = Age,
    y = propWave,
    label =  round(propWave, 2),
    fill = gender
  )) +  geom_bar(aes(fill = gender, group = gender), stat = "identity") +  labs(y =
                                                                                  "Proportion", title = "Responders (2011)", fill = "Sex") + geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(. ~ .) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45))
  theme(legend.position = "bottom")  + theme_classic() + geom_text(
    data = d3.sample,
    aes(label = paste("n=", numTot), y = Inf),
    vjust = 1,
    size = 3
  )

g2p <-
  ggplot(d3.pop.noEdu,
         aes(
           x = AgeCat,
           y = propWave4,
           label =  round(propWave4, 2),
           fill = Sex
         )) + facet_grid(. ~ .) + geom_bar(aes(fill = Sex, group = Sex), stat = "identity") +  labs(
           title = "Population (2011)",
           fill = "Sex",
           group = "Sex",
           y = "Proportion",
           x = "Age group"
         ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme(legend.position = "bottom") + theme_classic()

g2s

g2p

# alternative graphical display

d3.sample <-
  d3.sample %>% mutate(
    upperE = propWave + 1.96 * sqrt((num / numTot) * (1 - (num / numTot)) /
                                      numTot),
    lowerE = propWave - 1.96 * sqrt((num /
                                       numTot) * (1 - (num / numTot)) / numTot)
  )




g2s.points <-
  ggplot(
    left_join(
      d3.pop.noEdu,
      mutate(d3.sample, AgeCat = Age, Sex = gender),
      by = c("AgeCat", "Sex")
    ) %>%  filter(Sex == "Female"),
    aes(AgeCat, propWave4)
  ) + geom_point(col = "red", size = 3) + geom_point(aes(y = propWave), col =
                                                       1) + theme_bw() + geom_errorbar(aes(ymin = lowerE, ymax = upperE)) +
  labs(x = "Age group", y = "Proportion of females in age group", title = "Denmark 2011 / Wave 4 - population (red) and \n responders (black with 95% CI)")


g2s.points

ggsave(
  "Figures/sexW4All.png",
  plot = g2s.points,
  scale = 1.2,
  width = 5,
  height = 5
)

The distribution of sex differs between sample and population differs more than in Wave 4, males in the younger age groups are more underrepresented.

Age

The older females are somehow underrepresented in the sample compared to the population, as are the younger men.

tmp.data <- share1 %>% filter(Wave == "Wave 4")
par(mfrow = c(1, 2))

qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]),
  tmp.data$age_int[tmp.data$gender == "Female"],
  xlab = "Age in the population (F)",
  ylab = "Age in the sample (F)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)


qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]),
  tmp.data$age_int[tmp.data$gender == "Male"],
  xlab = "Age in the population (M)",
  ylab = "Age in the sample (M)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)

my.mat <-
  rbind.data.frame(
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Female"]),
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Male"])
  )

dimnames(my.mat)[[1]] <-
  c("Population females",
    "Sample W4 females",
    "Population males",
    "Sample W4 males")
dimnames(my.mat)[[2]] <-
  c("Min.", "1st Qu." ,  "Median", "Mean", "3rd Qu.", "Max.")


kable(my.mat, digits = 1, caption = "Distribution of age, population 2011 vs random sample in Wave 4") %>% kable_styling()
Distribution of age, population 2011 vs random sample in Wave 4
Min. 1st Qu. Median Mean 3rd Qu. Max.
Population females 50 57 65 66.0 73 100
Sample W4 females 50 57 64 65.6 73 99
Population males 50 56 63 64.3 71 100
Sample W4 males 50 57 63 64.5 71 96

Education

The lower educated individuals are underrepresented in the sample. The differences between the sample and the population seem present in all age and sex groups - only older females have a similar distribution of education in sample and population.

d3.sample <-
  share1 %>% filter(Wave == "Wave 4") %>% filter(age_int < 85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age =
             cut(
               age_int,
               breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
               right = FALSE
             ),
           gender,
           education_imp,
           Wave) %>% summarize(num = n())


num.tot <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(Wave == "Wave 4") %>% filter(age_int <
                                                                              85) %>%
  filter(!is.na(education_imp)) %>% group_by(Age = cut(
    age_int,
    breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
    right = FALSE
  ), gender, Wave) %>% summarize(numTot = n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age", "gender", "Wave")) %>%
  mutate(propWave = num /
           numTot)

d3.sample$education_imp <-
  factor(d3.sample$education_imp, levels = c("High", "Medium", "Low"))
# graph of the distribution of the education in the random sample from Wave 5, stratified by sex

g2s <-
  ggplot(d3.sample,
         aes(
           x = Age,
           y = propWave,
           label =  round(propWave, 2),
           fill = education_imp
         )) +  geom_bar(aes(fill = education_imp, group = education_imp), stat =
                          "identity") +  #labs(y="Proportion", title="Distribution of education in the random sample from Wave 4 (2011)", fill="Education", group="Education", x="Age group")
  labs(
    y = "Proportion",
    title = "Responders (2011)",
    fill = "Education",
    group = "Education",
    x = "Age group"
  ) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(gender ~
                                                                              .) + #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() + theme(legend.position = "none")


# graph of the distribution of the education in the population, stratified by sex

g2p <-
  ggplot(d3.pop,
         aes(
           x = AgeCat,
           y = propWave4,
           label =  round(propWave4, 2),
           fill = ISCED.cat
         )) + facet_grid(Sex ~ .) + geom_bar(aes(fill = ISCED.cat, group = ISCED.cat), stat =
                                               "identity") +
  labs(
    title = "Population (2011)",
    fill = "Education",
    group = "Education",
    y = "Proportion",
    x = "Age group"
  ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  # +
  theme_classic() + theme(legend.position = "none")

# displaying graphs
g2s

g2p

# saving the graphs

g2p <- g2p + theme(legend.position = "bottom")
g2s <- g2s + theme(legend.position = "none") + labs(y = "")

g <- g2p + g2s
#g # does not fit in the output, displaying separately
ggsave(
  "Figures/educationW4All.png",
  plot = g,
  scale = 1.2,
  width = 10,
  height = 5
)

Wave 5, all participants

All respondents from Wave5 vs population

# Wave 5, all participants

my.breaks <- c(50, 55, 60, 65, 70, 75, 80, 85, 100.1)


d3.sample <- filter(share1, Wave == "Wave 5") %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE), gender, Wave) %>% summarize(num =
                                                                                                n())

num.tot <-
  share1 %>%  filter(Wave == "Wave 5") %>% #filter(age_int <85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE),  Wave) %>% summarize(numTot =
                                                                                         n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age",  "Wave")) %>% mutate(propWave = num /
                                                                     numTot)

The analysis included 3919 participants from the random sample Wave 5.

Sex

g2s <-
  ggplot(d3.sample, aes(
    x = Age,
    y = propWave,
    label =  round(propWave, 2),
    fill = gender
  )) +  geom_bar(aes(fill = gender, group = gender), stat = "identity") +  labs(y =
                                                                                  "Proportion", title = "Responders (2013)", fill = "Sex") + geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(. ~
                                                                                                                                                                                                                         .) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45))
  theme(legend.position = "bottom")  + theme_classic() + geom_text(
    data = d3.sample,
    aes(label = paste("n=", numTot), y = Inf),
    vjust = 1,
    size = 3
  )

g2p <-
  ggplot(d3.pop.noEdu,
         aes(
           x = AgeCat,
           y = propWave5,
           label =  round(propWave5, 2),
           fill = Sex
         )) + facet_grid(. ~ .) + geom_bar(aes(fill = Sex, group = Sex), stat = "identity") +  labs(
           title = "Population (2013)",
           fill = "Sex",
           group = "Sex",
           y = "Proportion",
           x = "Age group"
         ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme(legend.position = "bottom") + theme_classic()


g2s

g2p

# alternative graphical display

d3.sample <-
  d3.sample %>% mutate(
    upperE = propWave + 1.96 * sqrt((num / numTot) * (1 - (num / numTot)) /
                                      numTot),
    lowerE = propWave - 1.96 * sqrt((num /
                                       numTot) * (1 - (num / numTot)) / numTot)
  )




g2s.points <-
  ggplot(
    left_join(
      d3.pop.noEdu,
      mutate(d3.sample, AgeCat = Age, Sex = gender),
      by = c("AgeCat", "Sex")
    ) %>%  filter(Sex == "Female"),
    aes(AgeCat, propWave5)
  ) + geom_point(col = "red", size = 3) + geom_point(aes(y = propWave), col =
                                                       1) + theme_bw() + geom_errorbar(aes(ymin = lowerE, ymax = upperE)) +
  labs(x = "Age group", y = "Proportion of females in age group", title = "Denmark 2013 / Wave 5 - population (red) and \n responders (black with 95% CI)")


g2s.points

ggsave(
  "Figures/sexW5All.png",
  plot = g2s.points,
  scale = 1.2,
  width = 5,
  height = 5
)

The distribution of sex differs between sample and population differs more than in Wave 5, males in the younger age groups are more underrepresented.

Age

The older females are somehow underrepresented in the sample compared to the population, as are the younger men.

tmp.data <- share1 %>% filter(Wave == "Wave 5")
par(mfrow = c(1, 2))

qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]),
  tmp.data$age_int[tmp.data$gender == "Female"],
  xlab = "Age in the population (F)",
  ylab = "Age in the sample (F)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)


qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]),
  tmp.data$age_int[tmp.data$gender == "Male"],
  xlab = "Age in the population (M)",
  ylab = "Age in the sample (M)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)

my.mat <-
  rbind.data.frame(
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Female"]),
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Male"])
  )

dimnames(my.mat)[[1]] <-
  c("Population females",
    "Sample W5 females",
    "Population males",
    "Sample W5 males")
dimnames(my.mat)[[2]] <-
  c("Min.", "1st Qu." ,  "Median", "Mean", "3rd Qu.", "Max.")


kable(my.mat, digits = 1, caption = "Distribution of age, population 2013 vs random sample in Wave 5") %>% kable_styling()
Distribution of age, population 2013 vs random sample in Wave 5
Min. 1st Qu. Median Mean 3rd Qu. Max.
Population females 50 57 65 66.0 73 100
Sample W5 females 50 57 64 65.5 72 100
Population males 50 56 63 64.3 71 100
Sample W5 males 50 57 64 65.3 72 98

Education

The lower educated individuals are underrepresented in the sample. The differences between the sample and the population seem present in all age and sex groups - only older females have a similar distribution of education in sample and population.

d3.sample <-
  share1 %>% filter(Wave == "Wave 5") %>% filter(age_int < 85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age =
             cut(
               age_int,
               breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
               right = FALSE
             ),
           gender,
           education_imp,
           Wave) %>% summarize(num = n())



num.tot <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(Wave == "Wave 5") %>%
  filter(age_int <
           85) %>%  filter(!is.na(education_imp)) %>% group_by(Age = cut(
             age_int,
             breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
             right = FALSE
           ), gender, Wave) %>% summarize(numTot = n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age", "gender", "Wave")) %>% mutate(propWave = num /
                                                                              numTot)

d3.sample$education_imp <-
  factor(d3.sample$education_imp, levels = c("High", "Medium", "Low"))

# graph of the distribution of the education in the random sample from Wave 5, stratified by sex

g2s <-
  ggplot(d3.sample,
         aes(
           x = Age,
           y = propWave,
           label =  round(propWave, 2),
           fill = education_imp
         )) +  geom_bar(aes(fill = education_imp, group = education_imp), stat =
                          "identity") +  #labs(y="Proportion", title="Distribution of education in the random sample from Wave 5 (2013)", fill="Education", group="Education", x="Age group")
  labs(
    y = "Proportion",
    title = "Responders (2013)",
    fill = "Education",
    group = "Education",
    x = "Age group"
  ) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(gender ~
                                                                              .) + #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() + theme(legend.position = "none")

# graph of the distribution of the education in the population, stratified by sex

g2p <-
  ggplot(d3.pop,
         aes(
           x = AgeCat,
           y = propWave5,
           label =  round(propWave5, 2),
           fill = ISCED.cat
         )) + facet_grid(Sex ~ .) + geom_bar(aes(fill = ISCED.cat, group = ISCED.cat), stat =
                                               "identity") +
  labs(
    title = "Population (2013)",
    fill = "Education",
    group = "Education",
    y = "Proportion",
    x = "Age group"
  ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() +  theme(legend.position = "none")

# displaying graphs
g2s

g2p

# saving the graphs

g2p <- g2p + theme(legend.position = "bottom")
g2s <- g2s + theme(legend.position = "none") + labs(y = "")

g <- g2p + g2s
#g # does not fit in the output, displaying separately
ggsave(
  "Figures/educationW5All.png",
  plot = g,
  scale = 1.2,
  width = 10,
  height = 5
)

Wave 6, all participants

All respondents from Wave 6 vs population

# Wave 6, all participants
my.breaks <- c(50, 55, 60, 65, 70, 75, 80, 85, 100.1)


d3.sample <- filter(share1, Wave == "Wave 6") %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE), gender, Wave) %>% summarize(num =
                                                                                                n())

num.tot <-
  share1 %>%  filter(Wave == "Wave 6") %>% #filter(age_int <85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE),  Wave) %>% summarize(numTot =
                                                                                         n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age",  "Wave")) %>% mutate(propWave = num /
                                                                     numTot)

The analysis included 3514 participants from the random sample Wave 6.

Sex

g2s <-
  ggplot(d3.sample, aes(
    x = Age,
    y = propWave,
    label =  round(propWave, 2),
    fill = gender
  )) +  geom_bar(aes(fill = gender, group = gender), stat = "identity") +  labs(y =
                                                                                  "Proportion", title = "Responders (2015)", fill = "Sex") + geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(. ~
                                                                                                                                                                                                                         .) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45))
  theme(legend.position = "bottom")  + theme_classic() + geom_text(
    data = d3.sample,
    aes(label = paste("n=", numTot), y = Inf),
    vjust = 1,
    size = 3
  )

g2p <-
  ggplot(d3.pop.noEdu,
         aes(
           x = AgeCat,
           y = propWave6,
           label =  round(propWave6, 2),
           fill = Sex
         )) + facet_grid(. ~ .) + geom_bar(aes(fill = Sex, group = Sex), stat = "identity") +  labs(
           title = "Population (2015)",
           fill = "Sex",
           group = "Sex",
           y = "Proportion",
           x = "Age group"
         ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme(legend.position = "bottom") + theme_classic()

g2s

g2p

# alternative graphical display

d3.sample <-
  d3.sample %>% mutate(
    upperE = propWave + 1.96 * sqrt((num / numTot) * (1 - (num / numTot)) /
                                      numTot),
    lowerE = propWave - 1.96 * sqrt((num /
                                       numTot) * (1 - (num / numTot)) / numTot)
  )




g2s.points <-
  ggplot(
    left_join(
      d3.pop.noEdu,
      mutate(d3.sample, AgeCat = Age, Sex = gender),
      by = c("AgeCat", "Sex")
    ) %>%  filter(Sex == "Female"),
    aes(AgeCat, propWave6)
  ) + geom_point(col = "red", size = 3) + geom_point(aes(y = propWave), col =
                                                       1) + theme_bw() + geom_errorbar(aes(ymin = lowerE, ymax = upperE)) +
  labs(x = "Age group", y = "Proportion of females in age group", title = "Denmark 2015 / Wave 6 - population (red) and \n responders (black with 95% CI)")


g2s.points

ggsave(
  "Figures/sexW6All.png",
  plot = g2s.points,
  scale = 1.2,
  width = 5,
  height = 5
)

The distribution of sex differs between sample and population differs more than in Wave 6, males in the younger age groups are more underrepresented.

Age

The older females are somehow underrepresented in the sample compared to the population, as are the younger men.

tmp.data <- share1 %>% filter(Wave == "Wave 6")
par(mfrow = c(1, 2))

qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2015[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]),
  tmp.data$age_int[tmp.data$gender == "Female"],
  xlab = "Age in the population (F)",
  ylab = "Age in the sample (F)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)


qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2015[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]),
  tmp.data$age_int[tmp.data$gender == "Male"],
  xlab = "Age in the population (M)",
  ylab = "Age in the sample (M)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)

my.mat <-
  rbind.data.frame(
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F"], data.pop.DK.NoEdu$X2015[data.pop.DK.NoEdu$Sex ==
                                                                                     "F"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Female"]),
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2015[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Male"])
  )

dimnames(my.mat)[[1]] <-
  c("Population females",
    "Sample W6 females",
    "Population males",
    "Sample W6 males")
dimnames(my.mat)[[2]] <-
  c("Min.", "1st Qu." ,  "Median", "Mean", "3rd Qu.", "Max.")


kable(my.mat, digits = 1, caption = "Distribution of age, population 2015 vs random sample in Wave 6") %>% kable_styling()
Distribution of age, population 2015 vs random sample in Wave 6
Min. 1st Qu. Median Mean 3rd Qu. Max.
Population females 50 57 65 66.0 73 100
Sample W6 females 50 58 65 65.9 72 98
Population males 50 56 64 64.5 71 100
Sample W6 males 50 58 65 65.6 72 100

Education

The lower educated individuals are underrepresented in the sample. The differences between the sample and the population seem present in all age and sex groups - only older females have a similar distribution of education in sample and population.

d3.sample <-
  share1 %>% filter(Wave == "Wave 6") %>% filter(age_int < 85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age =
             cut(
               age_int,
               breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
               right = FALSE
             ),
           gender,
           education_imp,
           Wave) %>% summarize(num = n())


num.tot <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(Wave == "Wave 6") %>% filter(age_int <
                                                                              85) %>%
  filter(!is.na(education_imp)) %>% group_by(Age = cut(
    age_int,
    breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
    right = FALSE
  ), gender, Wave) %>% summarize(numTot = n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age", "gender", "Wave")) %>% mutate(propWave = num /
                                                                              numTot)

d3.sample$education_imp <-
  factor(d3.sample$education_imp, levels = c("High", "Medium", "Low"))

# graph of the distribution of the education in the random sample from Wave 6, stratified by sex

g2s <-
  ggplot(d3.sample,
         aes(
           x = Age,
           y = propWave,
           label =  round(propWave, 2),
           fill = education_imp
         )) +  geom_bar(aes(fill = education_imp, group = education_imp), stat =
                          "identity") +  #labs(y="Proportion", title="Distribution of education in the random sample from Wave 6 (2015)", fill="Education", group="Education", x="Age group")
  labs(
    y = "Proportion",
    title = "Responders (2015)",
    fill = "Education",
    group = "Education",
    x = "Age group"
  ) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(gender ~
                                                                              .) + #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() + theme(legend.position = "none")

# graph of the distribution of the education in the population, stratified by sex

g2p <-
  ggplot(d3.pop,
         aes(
           x = AgeCat,
           y = propWave6,
           label =  round(propWave6, 2),
           fill = ISCED.cat
         )) + facet_grid(Sex ~ .) + geom_bar(aes(fill = ISCED.cat, group = ISCED.cat), stat =
                                               "identity") +
  labs(
    title = "Population (2015)",
    fill = "Education",
    group = "Education",
    y = "Proportion",
    x = "Age group"
  ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() + theme(legend.position = "none")

# displaying graphs
g2s

g2p

# saving the graphs

g2p <- g2p + theme(legend.position = "bottom")
g2s <- g2s + theme(legend.position = "none") + labs(y = "")

g <- g2p + g2s
#g # does not fit in the output, displaying separately
ggsave(
  "Figures/educationW6All.png",
  plot = g,
  scale = 1.2,
  width = 10,
  height = 5
)

Wave 7, all participants

All respondents from Wave 7 vs population of 52+ (no refreshment samples in wave 7). (The labels of the younger age group indicate 50-55 but refer to 52-55).

# Wave 7, all participants

my.breaks <- c(50, 55, 60, 65, 70, 75, 80, 85, 101.1)


d3.sample <- filter(share1, Wave == "Wave 7") %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE), gender, Wave) %>% summarize(num =
                                                                                                n())

num.tot <-
  share1 %>%  filter(Wave == "Wave 7") %>% #filter(age_int <85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(age_int, breaks = my.breaks, right = FALSE),  Wave) %>% summarize(numTot =
                                                                                         n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age",  "Wave")) %>% mutate(propWave = num /
                                                                     numTot)

The analysis included 3025 participants from the random sample Wave 5.

Sex

g2s <-
  ggplot(d3.sample, aes(
    x = Age,
    y = propWave,
    label =  round(propWave, 2),
    fill = gender
  )) +  geom_bar(aes(fill = gender, group = gender), stat = "identity") +  labs(y =
                                                                                  "Proportion", title = "Responders (2017)", fill = "Sex") + geom_text(size = 4, position = position_stack(vjust = 0.5)) +  
  facet_grid(. ~.) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45))
  theme(legend.position = "bottom")  + theme_classic() + geom_text(
    data = d3.sample,
    aes(label = paste("n=", numTot), y = Inf),
    vjust = 1,
    size = 4
  )

g2p <-
  ggplot(d3.pop.noEdu.52,
         aes(
           x = AgeCat,
           y = propWave7,
           label =  round(propWave7, 2),
           fill = Sex
         )) + facet_grid(. ~ .) + geom_bar(aes(fill = Sex, group = Sex), stat = "identity") +  labs(
           title = "Population (2017)",
           fill = "Sex",
           group = "Sex",
           y = "Proportion",
           x = "Age group"
         ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) + scale_fill_manual(values = c("blue", "pink")) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme(legend.position = "bottom") + theme_classic()

g2s

g2p

# alternative graphical display

d3.sample <-
  d3.sample %>% mutate(
    upperE = propWave + 1.96 * sqrt((num / numTot) * (1 - (num / numTot)) /
                                      numTot),
    lowerE = propWave - 1.96 * sqrt((num /
                                       numTot) * (1 - (num / numTot)) / numTot)
  )




g2s.points <-
  ggplot(
    left_join(
      d3.pop.noEdu.52,
      mutate(d3.sample, AgeCat = Age, Sex = gender),
      by = c("AgeCat", "Sex")
    ) %>%  filter(Sex == "Female"),
    aes(AgeCat, propWave7)
  ) + geom_point(col = "red", size = 3) + geom_point(aes(y = propWave), col =
                                                       1) + theme_bw() + geom_errorbar(aes(ymin = lowerE, ymax = upperE)) +
  labs(x = "Age group", y = "Proportion of females in age group", title = "Denmark 2017 / Wave 7 - population (red) and \n responders (black with 95% CI)")


g2s.points

ggsave(
  "Figures/sexW7All.png",
  plot = g2s.points,
  scale = 1.2,
  width = 5,
  height = 5
)

As in the other waves, males in the younger age groups are underrepresented.

Age

The older females are somehow underrepresented in the sample compared to the population, as are the younger men.

tmp.data <- share1 %>% filter(Wave == "Wave 7")
par(mfrow = c(1, 2))

qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F" &
                              data.pop.DK.NoEdu$Age >= 52], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                      "F" &
                                                                                      data.pop.DK.NoEdu$Age >= 52]),
  tmp.data$age_int[tmp.data$gender == "Female"],
  xlab = "Age in the population (F)",
  ylab = "Age in the sample (F)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)


qqplot(
  rep(data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M" &
                              data.pop.DK.NoEdu$Age >= 52], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                      "M" &
                                                                                      data.pop.DK.NoEdu$Age >= 52]),
  tmp.data$age_int[tmp.data$gender == "Male"],
  xlab = "Age in the population (M)",
  ylab = "Age in the sample (M)"
)
abline(a = 0,
       b = 1,
       col = 2,
       lty = 2)

my.mat <-
  rbind.data.frame(
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "F" &
                              data.pop.DK.NoEdu$Age >= 52], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                      "F" & data.pop.DK.NoEdu$Age >= 52]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Female"]),
    summary(rep(
      data.pop.DK.NoEdu$Age[data.pop.DK.NoEdu$Sex == "M"], data.pop.DK.NoEdu$X2013[data.pop.DK.NoEdu$Sex ==
                                                                                     "M"]
    )),
    summary(tmp.data$age_int[tmp.data$gender == "Male"])
  )

dimnames(my.mat)[[1]] <-
  c("Population females",
    "Sample W7 females",
    "Population males",
    "Sample W7 males")
dimnames(my.mat)[[2]] <-
  c("Min.", "1st Qu." ,  "Median", "Mean", "3rd Qu.", "Max.")


kable(my.mat, digits = 1, caption = "Distribution of age, population 2017 vs random sample in Wave 7") %>% kable_styling()
Distribution of age, population 2017 vs random sample in Wave 7
Min. 1st Qu. Median Mean 3rd Qu. Max.
Population females 52 59 66 67.1 74 100
Sample W7 females 52 60 66 67.5 74 101
Population males 50 56 63 64.3 71 100
Sample W7 males 52 60 66 66.9 73 98

Education

The lower educated individuals are underrepresented in the sample. The differences between the sample and the population seem present in all age and sex groups - only older females have a similar distribution of education in sample and population.

d3.sample <-
  share1 %>% filter(Wave == "Wave 7") %>% filter(age_int < 85) %>%  filter(!is.na(education_imp)) %>%
  group_by(Age =
             cut(
               age_int,
               breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
               right = FALSE
             ),
           gender,
           education_imp,
           Wave) %>% summarize(num = n())


num.tot <-
  left_join(share1, share.data.weights$df.w[, c("mergeid", "subsample", "Wave")], by =
              c("mergeid", "Wave")) %>% filter(Wave == "Wave 7") %>% filter(age_int < 85) %>%
  filter(!is.na(education_imp)) %>%
  group_by(Age = cut(
    age_int,
    breaks = c(50, 55, 60, 65, 70, 75, 80, 85),
    right = FALSE
  ), gender, Wave) %>% summarize(numTot = n())

d3.sample <-
  left_join(d3.sample, num.tot, by = c("Age", "gender", "Wave")) %>% mutate(propWave = num /
                                                                              numTot)

d3.sample$education_imp <-
  factor(d3.sample$education_imp, levels = c("High", "Medium", "Low"))

# graph of the distribution of the education in the random sample from Wave 5, stratified by sex

g2s <-
  ggplot(d3.sample,
         aes(
           x = Age,
           y = propWave,
           label =  round(propWave, 2),
           fill = education_imp
         )) +  geom_bar(aes(fill = education_imp, group = education_imp), stat =
                          "identity") +  #labs(y="Proportion", title="Distribution of education in the random sample from Wave 5 (2007)", fill="Education", group="Education", x="Age group")
  labs(
    y = "Proportion",
    title = "Responders (2017)",
    fill = "Education",
    group = "Education",
    x = "Age group"
  ) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +  facet_grid(gender ~
                                                                              .) + #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() + theme(legend.position = "none")

# graph of the distribution of the education in the population, stratified by sex

g2p <-
  ggplot(d3.pop.52,
         aes(
           x = AgeCat,
           y = propWave7,
           label =  round(propWave7, 2),
           fill = ISCED.cat
         )) + facet_grid(Sex ~ .) + geom_bar(aes(fill = ISCED.cat, group = ISCED.cat), stat =
                                               "identity") +
  labs(
    title = "Population (2017)",
    fill = "Education",
    group = "Education",
    y = "Proportion",
    x = "Age group"
  ) + geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  #theme(axis.text.x = element_text(angle = 45), legend.position ="bottom")
  theme_classic() + theme(legend.position = "none")

# displaying graphs
g2s

g2p

# saving the graphs

g2p <- g2p + theme(legend.position = "bottom")
g2s <- g2s + theme(legend.position = "none") + labs(y = "")

g <- g2p + g2s
#g # does not fit in the output, displaying separately
ggsave(
  "Figures/educationW7All.png",
  plot = g,
  scale = 1.2,
  width = 10,
  height = 5
)

5.2.6 Probability of loss to follow-up and death (ME2)

For the analysis purposes, the participants of some of the groups would be classified as lost to follow-up (out of sample, missingness, out-of-household if not re-included in the analysis later). Using this definition we estimate the probability of loss o follow-up, death and death after loss to follow-up. We estimated the cumulative incidence functions using Aalen-Johansen estimators for loss to follow-up and deaths (defining death times/events only for those that are not lost to follow-up - as if LOF was an absorbing state), and used Kaplan Meier estimator to estimate the probability of death after loss to follow-up (time of entry=the time of LTF, time of end=death, time of censoring = the end of the study for those who are not dead). The estimates were stratified by sex only, or by sex and age group.

# some manual checks are needed to account for ties/problems where we observe loss to follow-up after death, or similar problems

# calculating time to death from inclusion, assuming that the FU was updated in mid June 2019 - last observed time of death

# 1 - deaths, time to death or last assumed alive
sharew1.baseline$time.death <-
  sharew1.baseline$death_date - sharew1.baseline$int_date
sharew1.baseline$time.death[is.na(sharew1.baseline$time.death)] <-
  (date("2019-06-15") - sharew1.baseline$int_date)[is.na(sharew1.baseline$time.death)]
sharew1.baseline$time.death <-
  as.numeric(sharew1.baseline$time.death)

# some dates of death are missing for those that appear as dead in the DB (n=12)!
which.death.mo <-
  apply(missing.occasion.6_cv_mut, 1, function(x)
    which(x == -100)[1])

which.death <-
  which(!is.na(sharew1.baseline$death_date) | !is.na(which.death.mo))

# 21 participants are reported to be dead in a certain wave but the date is not available - imputing the time in study
which.problem <-
  which(is.na(sharew1.baseline$death_date) & !is.na(which.death.mo))
sharew1.baseline$time.death[which.problem] <-
  which.death.mo[which.problem] * 365 * 2 #imputing 2 years for each wave

# updated the definition of death
sharew1.baseline$event.death <- 0
sharew1.baseline$event.death[which(!is.na(sharew1.baseline$death_date) |
                                     !is.na(which.death.mo))] <- 1




# 2- lost to follow-up or out of sample, whichever first
# assuming that the loss occurs two years after the last avaialable interview, might be improved
which.lostfu.mo <-
  apply(missing.occasion.6_cv_mut, 1, function(x)
    which(x == -12 | x == -1000)[1])
which.lostfu <- which(!is.na(which.lostfu.mo))
sharew1.baseline$time.lostfu <- NA
sharew1.baseline$time.lostfu[!is.na(which.lostfu.mo)] <-
  sapply(which(!is.na(which.lostfu.mo)), function(x)
    int_date.mo[x, which.lostfu.mo[x] - 1] - int_date.mo[x, 1] + 365 * 2) #assigning the loss to follow up two years after the last available interview, might be improved
sharew1.baseline$event.lostfu <- 0
sharew1.baseline$event.lostfu[which.lostfu] <- 1

# out-of-household before being lost to follow-up, 4 participants
tmp <-
  which(is.na(sharew1.baseline$time.lostfu) &
          sharew1.baseline$event.lostfu)
# a
sharew1.baseline$time.lostfu[tmp] <- which.lostfu.mo[tmp] * 365 * 2

#setting the time of loss to follow-up before death for those for which the approximation led to this discrepancy
tmp <-
  which(sharew1.baseline$time.lostfu >= sharew1.baseline$time.death)
sharew1.baseline$time.lostfu[tmp] <-
  sharew1.baseline$time.death[tmp] - 1


# censored because out of the household
which.ooh <-
  which(
    apply(missing.occasion.6_cv_mut, 1, function(x)
      ! is.element(-1000, x) &
        !is.element(-12, x) & rev(x[!is.na(x)])[1] == -1001)
  )

# 3 - censoring time
which.censored <-
  which(!is.element(c(1:nrow(sharew1.baseline)), c(which.lostfu, which.death)))
which.censored.mo <- rep(-1, nrow(sharew1.baseline))
which.censored.mo[which.censored] <-
  apply(missing.occasion.6_cv_mut[which.censored, ], 1, function(x)
    which(is.na(x))[1])
which.censored.mo[is.na(which.censored.mo)] <-
  8 #setting measurement occasion to 8 for those with complete data
which.censored.mo[which.ooh] <-
  apply(missing.occasion.6_cv[which.ooh, ], 1, function(x)
    which(x == -1001)[1]) #setting measurement occasion to censoring due to out-of-household


sharew1.baseline$time.censored <- NA
sharew1.baseline$time.censored[!is.na(which.censored.mo) &
                                 which.censored.mo != -1] <-
  sapply(which(!is.na(which.censored.mo) &
                 which.censored.mo != -1), function(x)
                   int_date.mo[x, which.censored.mo[x] - 1] - int_date.mo[x, 1] + 365 * 2) #assigning the censoring time 2 years after the last available interview, might be improved
#apply(missing.occasion.6_cv[which.ooh,], 1, function(x) which(x==-1001)[1])


sharew1.baseline$event.censored <- 0
sharew1.baseline$event.censored[which.censored] <- 1
# aalen - johanesen estimator
# first step: time lost to follow-up as absorbing event

#mgus2$etime <- with(mgus2, ifelse(pstat==0, futime, ptime))
#> event <- with(mgus2, ifelse(pstat==0, 2*death, 1))
#> mgus2$event <- factor(event, 0:2, labels=c("censor", "pcm", "death"))
#> table(mgus2$event)

#sharew1.baseline$etime <- with(sharew1.baseline, ifelse(event.death==0, time.lostfu, time.death))/365.25
sharew1.baseline$etime <-
  with(sharew1.baseline,
       ifelse(event.lostfu == 1, time.lostfu, time.death)) / 365.25
sharew1.baseline$etime[which.censored] <-
  sharew1.baseline$time.censored[which.censored] / 365.25
sharew1.baseline$event <-
  with(sharew1.baseline, ifelse(event.lostfu == 0, 2 * event.death, 1))
# fixing for those lost + dead
sharew1.baseline$event[sharew1.baseline$event.death == 1 &
                         sharew1.baseline$event.lof == 1]  <- 1
sharew1.baseline$event <-
  factor(sharew1.baseline$event, 0:2, labels = c("censor", "lof", "death"))

mfit2 <- survfit(Surv(etime, event) ~ gender, data = sharew1.baseline)
#print(mfit2, rmean=240, scale=12)

#plot(mfit2, col=c(1,2,1,2), lty=c(2,2,1,1),
#mark.time=FALSE, lwd=2, xscale=1,
#xlab="Years post diagnosis", ylab="Probability in State")
#legend("topleft", c("death:female", "death:male", "loss to follow-up:female", "loss to follow-up:male"),
#col=c(1,2,1,2), lty=c(1,1,2,2), lwd=2, bty='n')

png(
  "Figures/CIFAAjAll.png",
  height = 7,
  width = 14,
  units = "in",
  res = 600,
  pointsize = 12
)

plot(
  mfit2,
  col = c(1, 2, 1, 2),
  lty = c(2, 2, 1, 1),
  mark.time = FALSE,
  lwd = 2,
  xscale = 1,
  xlab = "Years post diagnosis",
  ylab = "Probability in State"
)
legend(
  "topleft",
  c(
    "death:female",
    "death:male",
    "loss to follow-up:female",
    "loss to follow-up:male"
  ),
  col = c(1, 2, 1, 2),
  lty = c(1, 1, 2, 2),
  lwd = 2,
  bty = 'n'
)


graphics.off()

mfit3 <-
  survfit(Surv(etime, event) ~ gender + age_int_cat, data = sharew1.baseline)

mfit3.sexOnly <-
  survfit(Surv(etime, event) ~ gender, data = sharew1.baseline)

# example of plot of the results, not used

#png("Figures/CIFAAj.png", height = 7, width = 14, units = "in", res= 600,  pointsize = 12)

#par(mfrow=c(1,2))

#plot(mfit3[,3], col=rep(c("blue","pink"), each=4), lty=rep(1:4, 2), lwd=1:4, mark.time=FALSE,  xscale=1,
#xlab="Years post first interview", ylab="Probability of death", xlim =c(0,13), ylim=c(0, 1))

#legend("topleft", legend=c("50-59 years at inclusion", "60-69", "70-79", "80+"), lty=1:4, lwd=1:4)
#legend(x=0, y=0.65, legend=c("Males", "Females"), fill=c("blue", "pink"))


#plot(mfit3[,2], col=rep(c("blue","pink"), each=4), lty=rep(1:4, 2), lwd=1:4, mark.time=FALSE,  xscale=1,
#xlab="Years post first interview", ylab="Probability of loss to follow-up", xlim=c(0,13), ylim=c(0, 1))

#dev.off()
# aalen - johanesen estimator
# second step: time lost to follow-up is not an absorbing event, not used in the outputs

data3 <-
  tmerge(
    sharew1.baseline,
    sharew1.baseline,
    id = mergeid,
    death = event(time.death, event.death),
    lostfu = event(time.lostfu, event.lostfu)
  )

data3 <- tmerge(data3, data3, mergeid, enum = cumtdc(tstart))

#with(data3, table(event.death, event.lostfu))


temp <- with(data3, ifelse(death == 1, 2, event.lostfu))
data3$event <-
  factor(temp, 0:2, labels = c("censor", "lostfu", "death"))


mfit2 <-
  survfit(Surv(tstart, tstop, event) ~ gender,
          data = data3,
          id = mergeid)
# print(mfit2, rmean=240, digits=2)


mfit3b <-
  survfit(
    Surv(tstart / 365, tstop / 365, event) ~ gender + age_int_cat,
    data = data3,
    id = mergeid
  )
# print(mfit3b, rmean=5, digits=2)



png(
  "Figures/CIFAAj2MS.png",
  height = 7,
  width = 14,
  units = "in",
  res = 600,
  pointsize = 12
)

par(mfrow = c(1, 2))

plot(
  mfit3b[, 3],
  col = rep(c("blue", "pink"), each = 4),
  lty = rep(1:4, 2),
  lwd = 1:4,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability of death",
  xlim = c(0, 13),
  ylim = c(0, 1)
)

legend(
  "topleft",
  legend = c("50-59 years at inclusion", "60-69", "70-79", "80+"),
  lty = 1:4,
  lwd = 1:4
)
legend(
  x = 0,
  y = 0.65,
  legend = c("Males", "Females"),
  fill = c("blue", "pink")
)


plot(
  mfit3b[, 2],
  col = rep(c("blue", "pink"), each = 4),
  lty = rep(1:4, 2),
  lwd = 1:4,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability of loss to follow-up",
  xlim = c(0, 13),
  ylim = c(0, 1)
)

#legend("topleft", legend=c("50-59 years at inclusion", "60-69", "70-79", "80+"), lty=1:4, lwd=1:4)
#legend(x=0, y=0.65, legend=c("Males", "Females"), fill=c("blue", "pink"))

graphics.off()

# mfit3b$transitions


#sharew1.baseline$etime <- with(sharew1.baseline, ifelse(event.death==0, time.lostfu, time.death))/365.25
sharew1.baseline$etime <-
  with(sharew1.baseline,
       ifelse(event.lostfu == 1, time.lostfu, time.death)) / 365.25
sharew1.baseline$etime[which.censored] <-
  sharew1.baseline$time.censored[which.censored] / 365.25
sharew1.baseline$event <-
  with(sharew1.baseline, ifelse(event.lostfu == 0, 2 * event.death, 1))
# fixing for those lost + dead
sharew1.baseline$event[sharew1.baseline$event.death == 1 &
                         sharew1.baseline$event.lof == 1]  <- 1
sharew1.baseline$event <-
  factor(sharew1.baseline$event, 0:2, labels = c("censor", "lof", "death"))

#overall follow-up time for death, time to death or last follow-up time
sharew1.baseline$time.fu <- sharew1.baseline$time.death
sharew1.baseline$time.fu[which(sharew1.baseline$event.lostfu == 1 &
                                 sharew1.baseline$event.death == 0)] <-
  sharew1.baseline$time.lostfu[which(sharew1.baseline$event.lostfu == 1 &
                                       sharew1.baseline$event.death == 0)]
sharew1.baseline$time.fu[sharew1.baseline$event.censored == 1] <-
  sharew1.baseline$time.censored[sharew1.baseline$event.censored == 1]

#overall follow-up time for loss to follow-up, time to loss to follow-up or last info
sharew1.baseline$time.lofu <- sharew1.baseline$time.lostfu
sharew1.baseline$time.lofu[sharew1.baseline$event.lof == 0 &
                             sharew1.baseline$event.death == 1] <-
  sharew1.baseline$time.death[sharew1.baseline$event.lof == 0 &
                                sharew1.baseline$event.death == 1]
sharew1.baseline$time.lofu[sharew1.baseline$event.censored == 1] <-
  sharew1.baseline$time.censored[sharew1.baseline$event.censored == 1]
# CIF deaths after LOF, third output of the analysis

which.mergeid <- data3[data3$lostfu == 1, "mergeid"]

data4 <-
  filter(data3, lostfu == 0 & is.element(mergeid, which.mergeid))

#mfit4 <- survfit(Surv(tstop/365-tstart/365, death) ~ gender+age_int_cat, data=data4)

mfit5 <-
  survfit(Surv(tstart / 365, tstop / 365, death) ~ gender + age_int_cat, data =
            data4[data4$tstart >= 365 * 2, ])

mfit5.sexOnly <-
  survfit(Surv(tstart / 365, tstop / 365, death) ~ gender, data = data4[data4$tstart >=
                                                                          365 * 2, ])


png(
  "Figures/CIF_paper_v2.png",
  height = 10,
  width = 30,
  units = "cm",
  res = 600,
  pointsize = 16
)


par(mfrow = c(1, 3))

plot(
  mfit3[, 2],
  col = rep(c("blue", "pink"), each = 4),
  lty = rep(1:4, 2),
  lwd = 1:4,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability loss to follow-up",
  xlim = c(0, 13),
  ylim = c(0, 1)
)


legend(
  "topleft",
  legend = c("50-59 years at inclusion", "60-69", "70-79", "80+"),
  lty = 1:4,
  lwd = 1:4
)

#legend("topright", legend=c("Males", "Females"), fill=c("blue", "pink"))
legend(
  x = 0,
  y = 0.55,
  legend = c("Males", "Females"),
  fill = c("blue", "pink")
)


plot(
  mfit3[, 3],
  col = rep(c("blue", "pink"), each = 4),
  lty = rep(1:4, 2),
  lwd = 1:4,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability of death (before loss to follow-up)",
  xlim = c(0, 13),
  ylim = c(0, 1)
)


plot(
  mfit5,
  fun = function(x)
    1 - x,
  ylab = "Probability of death after loss to follow-up",
  xlab = "Years  post first interview" ,
  col = rep(c("blue", "pink"), each = 4),
  lty = rep(1:4, 2),
  lwd = 1:4,
  mark.time = FALSE,
  xscale = 1,
  xlim = c(0, 13),
  ylim = c(0, 1)
)

graphics.off()



### figure 6 for paper
tiff(
  "FiguresSubmission/Fig6.tif",
  height = 10,
  width = 30,
  units = "cm",
  res = 600,
  pointsize = 16,
  compression = "lzw"
)



par(mfrow = c(1, 3))

plot(
  mfit3[, 2],
  col = rep(c("blue", "pink"), each = 4),
  lty = rep(1:4, 2),
  lwd = 1:4,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability loss to follow-up",
  xlim = c(0, 13),
  ylim = c(0, 1)
)


legend(
  "topleft",
  legend = c("50-59 years at inclusion", "60-69", "70-79", "80+"),
  lty = 1:4,
  lwd = 1:4
)

#legend("topright", legend=c("Males", "Females"), fill=c("blue", "pink"))
legend(
  x = 0,
  y = 0.55,
  legend = c("Males", "Females"),
  fill = c("blue", "pink")
)


plot(
  mfit3[, 3],
  col = rep(c("blue", "pink"), each = 4),
  lty = rep(1:4, 2),
  lwd = 1:4,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability of death (before loss to follow-up)",
  xlim = c(0, 13),
  ylim = c(0, 1)
)




plot(
  mfit5,
  fun = function(x)
    1 - x,
  ylab = "Probability of death after loss to follow-up",
  xlab = "Years  post first interview" ,
  col = rep(c("blue", "pink"), each = 4),
  lty = rep(1:4, 2),
  lwd = 1:4,
  mark.time = FALSE,
  xscale = 1,
  xlim = c(0, 13),
  ylim = c(0, 1)
)

graphics.off()



#######




############ sex only #################

png(
  "Figures/CIF_paper_v2_sexOnly.png",
  height = 10,
  width = 30,
  units = "cm",
  res = 600,
  pointsize = 12
)


par(mfrow = c(1, 3))

plot(
  mfit3.sexOnly[, 2],
  col = c("blue", "pink"),
  lty = 1,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability loss to follow-up",
  xlim = c(0, 13),
  ylim = c(0, 1)
)


legend("topright",
       legend = c("Males", "Females"),
       fill = c("blue", "pink"))




plot(
  mfit3.sexOnly[, 3],
  col = c("blue", "pink"),
  lty = 1,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability of death (before loss to follow-up)",
  xlim = c(0, 13),
  ylim = c(0, 1)
)



#legend("topleft", legend=c("50-59 years at inclusion", "60-69", "70-79", "80+"), lty=1:4, lwd=1:4)
#legend(x=0, y=0.65, legend=c("Males", "Females"), fill=c("blue", "pink"))


plot(
  mfit5.sexOnly,
  fun = function(x)
    1 - x,
  ylab = "Probability of death after loss to follow-up",
  xlab = "Years  post first interview" ,
  col = c("blue", "pink"),
  lty = 1,
  mark.time = FALSE,
  xscale = 1,
  xlim = c(0, 13),
  ylim = c(0, 1)
)

#legend("topleft", legend=c("50-59 years at inclusion", "60-69", "70-79", "80+"), lty=1:4, lwd=1:4)
#legend(x=0, y=0.65, legend=c("Males", "Females"), fill=c("blue", "pink"))

graphics.off()

Overall, the estimated probability of loss to follow-up increases most notably at the second interview (about 20% 2 years after the baseline interview), and it increased up to 40% by the end of the study. The estimated probability of death by the end of the study was about 20% prior to drop-out, and about 35% after post drop-out, somehow larger for males.

# plots to display

par(mfrow = c(1, 3))

plot(
  mfit3.sexOnly[, 2],
  col = c("blue", "pink"),
  lty = 1,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability loss to follow-up",
  xlim = c(0, 13),
  ylim = c(0, 1)
)


legend("topright",
       legend = c("Males", "Females"),
       fill = c("blue", "pink"))




plot(
  mfit3.sexOnly[, 3],
  col = c("blue", "pink"),
  lty = 1,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability of death (before loss to follow-up)",
  xlim = c(0, 13),
  ylim = c(0, 1)
)



#legend("topleft", legend=c("50-59 years at inclusion", "60-69", "70-79", "80+"), lty=1:4, lwd=1:4)
#legend(x=0, y=0.65, legend=c("Males", "Females"), fill=c("blue", "pink"))


plot(
  mfit5.sexOnly,
  fun = function(x)
    1 - x,
  ylab = "Probability of death after loss to follow-up",
  xlab = "Years  post first interview" ,
  col = c("blue", "pink"),
  lty = 1,
  mark.time = FALSE,
  xscale = 1,
  xlim = c(0, 13),
  ylim = c(0, 1)
)

#legend("topleft", legend=c("50-59 years at inclusion", "60-69", "70-79", "80+"), lty=1:4, lwd=1:4)
#legend(x=0, y=0.65, legend=c("Males", "Females"), fill=c("blue", "pink"))

The probability of loss to follow-up was virtually the same across age and sex. In contrast, the probability of death prior and post dropout substantially increased with age as expected, and tended to be higher for men at younger ages.

# plots to display

par(mfrow = c(1, 3))

plot(
  mfit3[, 2],
  col = rep(c("blue", "pink"), each = 4),
  lty = rep(1:4, 2),
  lwd = 1:4,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability loss to follow-up",
  xlim = c(0, 13),
  ylim = c(0, 1)
)


legend(
  "topleft",
  legend = c("50-59 years at inclusion", "60-69", "70-79", "80+"),
  lty = 1:4,
  lwd = 1:4
)
legend("topright",
       legend = c("Males", "Females"),
       fill = c("blue", "pink"))




plot(
  mfit3[, 3],
  col = rep(c("blue", "pink"), each = 4),
  lty = rep(1:4, 2),
  lwd = 1:4,
  mark.time = FALSE,
  xscale = 1,
  xlab = "Years post first interview",
  ylab = "Probability of death (before loss to follow-up)",
  xlim = c(0, 13),
  ylim = c(0, 1)
)



#legend("topleft", legend=c("50-59 years at inclusion", "60-69", "70-79", "80+"), lty=1:4, lwd=1:4)
#legend(x=0, y=0.65, legend=c("Males", "Females"), fill=c("blue", "pink"))


plot(
  mfit5,
  fun = function(x)
    1 - x,
  ylab = "Probability of death after loss to follow-up",
  xlab = "Years  post first interview" ,
  col = rep(c("blue", "pink"), each = 4),
  lty = rep(1:4, 2),
  lwd = 1:4,
  mark.time = FALSE,
  xscale = 1,
  xlim = c(0, 13),
  ylim = c(0, 1)
)

#legend("topleft", legend=c("50-59 years at inclusion", "60-69", "70-79", "80+"), lty=1:4, lwd=1:4)
#legend(x=0, y=0.65, legend=c("Males", "Females"), fill=c("blue", "pink"))

5.2.7 Dropout effect on outcome (ME3)

5.2.7.1 Mean profiles of outcome by time of death

The graphs below show the average grip strength for groups of participants stratified by the measurement occasion of death, participants with complete data (7 observations, category named still in the cohort) are also displayed for comparison. The analyses were stratified by sex and age group. The 70-80 and the 80+ age groups were merged due to the small number of participants that entered the study at an age older than 80. The purpose of this IDA analyses is to provide empirical evidence of informative drop-out.

Participants that die during the study have, from inclusion, lower values of grip strength compared to others, especially among men.

#matrix with Wave (names of the waves if not missing, NA otherwise)
Wave.all <- select(sharew1, starts_with("Wave"))

#matrix with the maxgrip, NA if missing
mg <-  select(sharew1, starts_with("maxgrip"))
wavepart <-
  apply(mg, 1, function(x)
    paste0(c(1, 2, 3, 4, 5, 6, 7)[!is.na(x)], collapse = "")) #wideformat


#maxgrip defined by occasion of measurement
#mg.occasion <-matrix(NA, ncol=num.waves, nrow=num.obs.w1)

#for(i in 1:num.waves)
#  mg.occasion[,i] <- cbind.data.frame(mg, NA, NA, NA, NA, NA, NA, NA)[cbind(seq_along(baseline.wave.col), baseline.wave.col+i-1)]
#dimnames(mg.occasion)[[2]] <- paste0("M" , 1:num.waves)

missing.occasion.5_cv_factors <-
  apply(missing.occasion.5_cv, 2, function(x)
    factor(
      x,
      levels = c(-1000,-100,-12,-11, 1),
      labels = c(
        "Out-of-sample",
        "Death",
        "Lost to FU",
        "Intermittent NA",
        "Interview"
      )
    ))


df <-
  cbind.data.frame(
    id = sharew1.baseline$mergeid,
    mg.occasion,
    missing.occasion.5_cv_factors,
    gender = sharew1.baseline$gender,
    age_int_cat = sharew1.baseline$age_int_cat,
    age_int = sharew1.baseline$age_int
  )

dimnames(df)[[2]] <-
  c(
    "ID",
    paste0("M", 1:num.waves),
    paste0("I", 1:num.waves),
    "gender",
    "age_int_cat",
    "age_int"
  )

df.melt  <- reshape2::melt(df)

#missingness in M2
# updated Aug2022, some problems with the out of sample that could be followed by an interview in some cases
# codes with some examples: "DK-513768-01", "DK-870006-02", "DK-095632-02", "DK-212267-02", "DK-538968-02", "DK-212267-02"

# some of the cases where out of sample is followed by some interview - for this reason recoding was needed


#tmp <-select(df, starts_with("I"))
tmp <- select(df, c("I1", "I2", "I3", "I4", "I5", "I6", "I7"))
which.d <- apply(tmp, 1, function(x)
  which(x == "Death")[1]) - 1
#which.d[is.na(which.d)] <- "Alive"
which.d[is.na(which.d)] <- "Still in the \ncohort"
#which.d[is.na(which.d)&!is.na(df$I7)] <- "Still in the \ncohort with 7 measurements"


which.d <- factor(which.d)


#which.lof <- apply(tmp,1, function(x) which(x=="Lost to FU")[1])-1

which.lof <-
  apply(tmp, 1, function(x)
    which(x == "Lost to FU" | x == "Out-of-sample")[1]) - 1

#which.lof <- apply(tmp,1, function(x) which(x =="Out-of-sample")[1])-1


# MO of last interview, for few subjects classified as out of sample there is a later interview, happens for 9 participants
which.lastInterview <-
  apply(tmp, 1, function(x)
    rev(which(x == "Interview"))[1] - 1)


# does not fix the problem with the 9 participants, bug
#which.lof[which(which.lastInterview > which.lof)] <- apply(tmp[which(which.lastInterview > which.lof),],1, function(x) which(x=="Lost to FU"| x =="Out-of-sample")[1])-1

which.lof[which(which.lastInterview > which.lof)]  <- NA

which.lof[is.na(which.lof)] <- "Complete"


which.lof <- factor(which.lof)
#which.lastInterview <- factor(which.lastInterview)

num.possible.interviews <- apply(tmp, 1, function(x)
  sum(!is.na(x)))


tmp <-
  cbind.data.frame(df,
                   which.d,
                   which.lof,
                   num.possible.interviews,
                   which.lastInterview)

tmp$age_int_cat70 <- tmp$age_int_cat
levels(tmp$age_int_cat70) <- c("50-59", "60-69" , "70+", "70+")

df.melt5  <-
  reshape2::melt(
    tmp,
    id.vars = c(
      "ID",
      "age_int_cat70",
      "gender",
      "which.d",
      "which.lof",
      "num.possible.interviews"
    ),
    measure.vars = c("M1", "M2", "M3", "M4", "M5", "M6", "M7")
  )

p5 <-
  ggplot(
    df.melt5 %>% filter(
      which.d == "Still in the \ncohort" &
        num.possible.interviews == 7 | which.d != "Still in the \ncohort"
    ),
    aes(variable, value, group = which.d)
  ) +
  #geom_point(alpha=0.1) +
  #geom_line(alpha=.1) +
  stat_summary(aes(group = which.d, color = which.d),
               geom = "line",
               fun = mean) +
  stat_summary(aes(group = which.d, color = which.d),
               geom = "point",
               fun = mean) +
  facet_grid(gender ~ age_int_cat70) +
  labs(x = "Measurement occasion", y = "Average grip strength (kg) of the group", color = "Death at measurement occasion") + theme_bw() + theme(legend.position = "bottom")
p5

ggsave(
  "Figures/ProfileDeaths_v3.png",
  height = 6,
  width = 8,
  plot = ggplot2::last_plot(),
  scale = .7
)

5.2.7.2 Mean profiles of outcome by time to loss to follow-up

A similar analysis was also performed stratifying the participants by the measurement occasion of last available interview, if later interviews were missing (even though it is possible that participants will participate again future waves, as they have not all been excluded from the study and intermittent missingness is possible). Participants that died during the study are excluded from the graph. Participants in the category Complete include those with complete information (7 available measurements).

The difference in mean outcome between complete and incomplete cases due to missingness is smaller compared to what was observed for death and specific trends are not observed.

#### losses to follow-up -
#### as for deaths: in the group with complete follow-up we consider only those with 7 measurements


# set to missing those not lost to follow-up but defined as complete
tmp$which.lof[tmp$num.possible.interviews < 7 &
                which.lof == "Complete"] <- NA

df.melt5  <-
  reshape2::melt(
    tmp,
    id.vars = c(
      "ID",
      "age_int_cat70",
      "gender",
      "which.d",
      "which.lof",
      "num.possible.interviews"
    ),
    measure.vars = c("M1", "M2", "M3", "M4", "M5", "M6", "M7")
  )


p6 <-
  ggplot(
    df.melt5 %>% filter(which.d == "Still in the \ncohort" &
                          !is.na(which.lof)),
    aes(variable, value, group = which.lof)
  ) +
  #geom_point(alpha=0.1) +
  #geom_line(alpha=.1) +
  stat_summary(aes(group = which.lof, color = which.lof),
               geom = "line",
               fun = mean) +
  stat_summary(aes(group = which.lof, color = which.lof),
               geom = "point",
               fun = mean) +
  facet_grid(gender ~ age_int_cat70) +
  labs(x = "Measurement occasion", y = "Average grip strength (kg) of the group", color = "Last interview") + theme_bw() + theme(legend.position = "bottom")
p6

ggsave(
  "Figures/ProfileLOF_v5.png",
  height = 6,
  width = 8,
  plot = ggplot2::last_plot(),
  scale = .7
)



### figure 7 for paper

p5 + p6

ggsave(
  "FiguresSubmission/Fig7.tif",
  height = 7.5,
  width = 15,
  plot = ggplot2::last_plot(),
  scale = .7,
  device = "tiff",
  dpi = 600,
  compression = "lzw"
)

#####end figure 7################

5.3 Univariate descriptions

5.3.1 Description of variables at baseline (U1)

Here we describe the distribution of the outcome and of the explanatory variables at baseline.

# Mark Baillie's code
source("scripts/ida_plot_univar.R")

The overall summary of all the variables from analysis strategy at baseline (categorical and numerical) is given in the table below. We report the distribution of the physical activity variables using four and two levels only in this summary. Later only the binary variables that will be used in modelling are summarized.

Overall characteristics at baseline.
N

N=5452
gender : Female 5452 0.53 2869/5452
age_int 5452 53.0 60.0 70.0
62.5 ± 10.3
age_int_cat : 50-59 5452 0.47 2576/5452
  60-69 0.28 1502/5452
  70-80 0.19 1012/5452
  80+ 0.07 362/5452
weight 5361 65.0 75.0 85.0
76.4 ± 15.2
height_imp 5418 165.00 171.00 178.00
171.42 ±  9.13
education_imp : Low 5428 0.22 1191/5428
  Medium 0.39 2130/5428
  High 0.39 2107/5428
pa_vig : More than once a week 5423 0.46 2519/5423
  Once a week 0.14 755/5423
  One to three times a month 0.07 368/5423
  Hardly ever, or never 0.33 1781/5423
pa_vig_freq 5423 0.6 3274/5423
pa_low : More than once a week 5422 0.81 4400/5422
  Once a week 0.09 486/5422
  One to three times a month 0.03 172/5422
  Hardly ever, or never 0.07 364/5422
pa_low_freq 5422 0.9 4886/5422
cusmoke_imp : Yes 5423 0.26 1395/5423
maxgrip 5272 28.0 35.0 47.0
37.1 ± 12.9
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.

At baseline interview most participants were in the younger age groups, the vast majority reported low-intensity physical activity at least once a week, 63% vigorous physical activity at least once a week. About a quarter were smokers, the most common education level was high and there were slighly more females than men.

#to generate the latex tables
s <-
    Hmisc::summaryM(
        gender + age_int+ age_int_cat+  region  + education_imp + weight + height_imp + pa_vig_freq +  pa_low_freq + cusmoke_imp  ~1 ,
        data =sharew1.baseline,
        overall = FALSE,
        test = FALSE
    )


mylatex <- function (...) {
    o <- capture.output(latex(file = "", where = "!htbp", ...))
    # this will strip /all/ line-only comments; or if you're only
    #  interested in stripping the first such comment you could
    #  adjust accordingly
    o <- grep('^%', o, inv=T, value=T)
    cat(o, sep='\n')
}


#TEX TO EDIT THE TABLE FOR THE PAPER
tex.table <- latex(
    s, digits=2, what="%", file="out/OverallT1.tex")


s.gender <-
    Hmisc::summaryM(
        age_int+ age_int_cat+  region  + education_imp + weight + height_imp + pa_vig_freq +  pa_low_freq + cusmoke_imp  ~gender ,
        data =sharew1.baseline,
        overall = FALSE,
        test = FALSE
    )


#TEX TO EDIT THE TABLE FOR THE PAPER
tex.table <- latex(
    s.gender, digits=2, what="%", file="out/ByGenderT1.tex")

The distribution of the numerical variables is reported also graphically (the graphical display of categorical variables is omitted).

5.3.1.1 Graphical display for numerical variables at baseline

5.3.1.1.1 Age
## This is a function the plots a strip plot, histogram and boxplot, including five number summary. 
ida_plot_univar(sharew1.baseline, "age_int")

The distribution of age at baseline was positively asymmetric.

5.3.1.1.2 Weight
ida_plot_univar(sharew1.baseline, "weight")

The variable was reported with digit preference (values ending with 0 and 5 were more frequent than expected)

5.3.1.1.3 Height
ida_plot_univar(sharew1.baseline, "height_imp")

The variable was reported with digit preference (values ending with 0 and 5 were more frequent than expected)

5.3.1.1.4 Grip strength
ida_plot_univar(sharew1.baseline, "maxgrip")

The variable was reported with digit preference (values ending with 0 and 5 were more frequent than expected); the distribution is bimodal, reflecting the large difference in the location of the distribution for men and females.

The characteristics observed at baseline were observed also at following measurement occasions.

5.3.1.1.5 Further exploration of digit preference for grip strength

We present the grip strength data with barplots, where the bars of the values with numbers ending with 0 or 5 are plotted in red. Here we display all the available measurements (data by wave were displayed previously).

All the peaks that deviate from the expected shape of the distribution are associated to values that end with 0 or 5.

my.tab <- table(share1$maxgrip)

my.values <- as.numeric(dimnames(my.tab)[[1]])

my.colors <- rep(1, length(my.values))
my.colors[c(which(my.values %% 5 == 0))] <- "gray"
my.colors[c(which(my.values %% 0 == 0))] <- "gray"


png(
  "figures/barplotMaxgripDP.png",
  width = 10,
  height = 5,
  units = "in",
  res = 600
)

b <-
  barplot(
    table(share1$maxgrip),
    col = my.colors,
    main = "All waves",
    xpd = FALSE,
    axes = FALSE,
    xlab = "Grip strength (kg)",
    ylab = "Number of measurements",
    xaxt = 'n',
    cex.names = 2,
    cex.axis = 2
  )
axis(1, at = b[c(which(my.values %% 5 == 0), which(my.values %% 0 == 0))], labels  = my.values[c(which(my.values %%
                                                                                                         5 == 0), which(my.values %% 0 == 0))])
axis(2)

graphics.off()

#### figure 10 for paper

tiff(
  "figuresSubmission/fig10.tif",
  width = 10,
  height = 5,
  units = "in",
  res = 600,
  compression = "lzw"
)

b <-
  barplot(
    table(share1$maxgrip),
    col = my.colors,
    main = "All waves",
    xpd = FALSE,
    axes = FALSE,
    xlab = "Grip strength (kg)",
    ylab = "Number of measurements",
    xaxt = 'n',
    cex.names = 2,
    cex.axis = 2
  )
axis(1, at = b[c(which(my.values %% 5 == 0), which(my.values %% 0 == 0))], labels  = my.values[c(which(my.values %%
                                                                                                         5 == 0), which(my.values %% 0 == 0))])
axis(2)

graphics.off()

##########
my.tab <- table(sharew1.baseline$maxgrip)

my.values <- as.numeric(dimnames(my.tab)[[1]])

my.colors <- rep(1, length(my.values))
my.colors[c(which(my.values %% 5 == 0))] <- "gray"
my.colors[c(which(my.values %% 0 == 0))] <- "gray"


#png("figures/barplotMaxgripDPBaseline.png", width = 10, height = 5, units = "in", res=600)

b <-
  barplot(
    table(sharew1.baseline$maxgrip),
    col = my.colors,
    main = "Baseline measurements",
    xpd = FALSE,
    axes = FALSE,
    xlab = "Grip strength (kg)",
    ylab = "Number of measurements",
    xaxt = 'n'
  )
axis(1, at = b[c(which(my.values %% 5 == 0), which(my.values %% 0 == 0))], labels  = my.values[c(which(my.values %%
                                                                                                         5 == 0), which(my.values %% 0 == 0))])
axis(2)

5.3.2 Description of the time varying variables at later times (U2)

Here we summarize the longitudinal data of outcome and time-varying independent variables, stratifying the summary statistics by wave. Note that as wave is the time metric of the data collection process, the summaries stratified by wave can be used for the identification of data collection problems. The longitudinal trends of the time varying variables are summarized later (L2 for the outcome and L4 for the time-varying variables).

The digit preference was observed in all waves for weight and the outcome; the proportions did not vary greatly for categorical variables. The changes for age were described in previous sections.

s <-
  Hmisc::summaryM(
    #Wave+ 
    gender+ age_int+ age_int_cat+ weight+ pa_vig_freq+ pa_low_freq+ maxgrip ~ Wave,
    data = share1,
    overall = FALSE,
    test = FALSE
  )

Hmisc::html(
  s,
  caption = 'Overall baseline characteristics across waves',
  exclude1 = TRUE,
  npct = 'both',
  digits = 3,
  prmsd = TRUE,
  brmsd = TRUE,
  msdsize = mu$smaller2, 
  scroll = TRUE
)
Overall baseline characteristics across waves.
N
Wave 1
N=1596
Wave 2
N=2487
Wave 3
N=1979
Wave 4
N=2112
Wave 5
N=3919
Wave 6
N=3514
Wave 7
N=3025
gender : Female 18632 0.53 850/1596 0.53 1330/2487 0.54 1069/1979 0.53 1122/2112 0.53 2071/3919 0.53 1858/3514 0.53 1604/3025
age_int 18632 56.00 62.00 72.00
64.40 ± 10.58
56.00 63.00 72.00
64.53 ± 10.30
58.00 64.00 73.00
65.79 ±  9.93
57.00 64.00 72.00
65.11 ± 10.53
57.00 64.00 72.00
65.39 ± 10.08
58.00 65.00 72.00
65.77 ± 10.03
60.00 66.00 73.00
67.23 ±  9.52
age_int_cat : 50-59 18632 0.40 644/1596 0.38 946/2487 0.32 641/1979 0.36 754/2112 0.34 1317/3919 0.32 1121/3514 0.25 748/3025
  60-69 0.29 455/1596 0.31 783/2487 0.35 687/1979 0.33 707/2112 0.35 1376/3919 0.35 1231/3514 0.37 1122/3025
  70-80 0.22 353/1596 0.22 538/2487 0.23 448/1979 0.21 438/2112 0.22 859/3919 0.23 820/3514 0.28 845/3025
  80+ 0.09 144/1596 0.09 220/2487 0.10 203/1979 0.10 213/2112 0.09 367/3919 0.10 342/3514 0.10 310/3025
weight 16356 65.0 74.0 84.0
74.7 ± 14.6
65.0 75.0 85.0
75.5 ± 14.7
65.0 75.0 85.0
76.4 ± 15.3
65.0 75.0 85.0
76.6 ± 15.4
65.0 76.0 86.5
77.2 ± 15.8
66.0 76.0 87.0
77.8 ± 15.9
pa_vig_freq 14709 0.60 956/1591 0.50 1226/2430 0.53 1097/2077 0.61 2405/3913 0.62 2178/3509 0.59 704/1189
pa_low_freq 14709 0.88 1400/1592 0.89 2165/2430 0.89 1843/2077 0.90 3510/3911 0.91 3182/3509 0.88 1042/1190
maxgrip 17929 26.0 34.0 46.0
36.1 ± 13.2
25.0 32.0 44.0
34.6 ± 12.6
27.0 34.0 46.0
36.2 ± 12.8
27.0 35.0 47.0
37.2 ± 12.9
27.0 35.0 47.0
36.9 ± 12.4
28.0 35.0 47.0
37.0 ± 12.4
27.0 35.0 46.0
36.7 ± 12.1
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.

The distribution of the numerical variables through Waves are also presented graphically.

share1 %>% 
  ggplot(aes(x=age_int)) +
  geom_histogram(binwidth=1, aes(y=stat(density), group=measurement_occasion)) +
  facet_grid(. ~ Wave) +   theme_bw() +xlab("Age at interview")

  share1 %>% 
  ggplot(aes(x=weight)) +
  geom_histogram(binwidth=1, aes(y=stat(density), group=measurement_occasion)) +
  facet_grid(. ~ Wave) +   theme_bw() +xlab("Weight (kg)")

   share1 %>% 
  ggplot(aes(x=maxgrip)) +
  geom_histogram(binwidth=1, aes(y=stat(density), group=measurement_occasion)) +
  facet_grid(. ~ Wave) +   theme_bw() +xlab("Grip strength (kg)")

5.4 Multivariate description of data

5.4.1 Associations at baseline with structural variables (V1)

Here we explore the associations between explanatory variables measured at baseline and age and sex, which are the structural variables of interest (the association with wave was explored in U2, the association with type of questionnaire in missing data).

We present the association of age and the categorical explanatory variables plotting the smoothed relationship between age and the value of the variable, stratifying by sex (categorical variables have two categories and are internally coded as 0/1, the smoothed relationship is obtained using the geom_smooth() function, method: gam). The association between education and age and sex was extensively explored in ME1 and therefore it is not presented here.

Vigorous physical activity in males decreases more sharply after 60, the descrease in vigorous PA seems more linear for females. Moderate physical activity remaines stable up to approximately 70 years and decreases sharply afterwords, males and females have similar association between activity and age. There is a low proportion of smokers at older ages, and females have smaller probability of smoking. Weight and height at baseline are negatively associated to age. Beside ageing, this might be due to the cohort effect, which is further explored in LE1, or drop-out effect.

ggplot(filter(share.data.first.interview,!is.na(pa_vig_freq)),
       aes(age_int, ifelse(pa_vig_freq == 1, 1, 0))) + geom_smooth(aes(fill =
                                                                         gender, color = gender)) + scale_fill_manual(values = c("blue", "pink")) + scale_color_manual(values = c("blue", "pink")) +
  labs(x =
         "Age at baseline",
       y = "Probability of vigorous physical activity (geom_smooth)",
       fill = "Sex",
       color = "Sex") + theme_bw()

p1 <- ggplot(filter(share.data.first.interview,!is.na(pa_low_freq)),
       aes(age_int, ifelse(pa_low_freq == 1, 1, 0))) +
  geom_smooth(aes(fill = gender, color = gender)) + scale_fill_manual(values = c("blue", "pink")) + scale_color_manual(values = c("blue", "pink")) +  labs(x = "Age at baseline",
                                                                                                                                                           y = "Probability of low-intensity physical activity (geom_smooth)",
                                                                                                                                                           fill = "Sex",
                                                                                                                                                           color = "Sex") + theme_bw()

p2 <- ggplot(filter(share1, !is.na(pa_vig_freq)), aes(age_int, ifelse(pa_vig_freq==1, 1, 0))) + geom_smooth(aes(fill=gender, color=gender)) + scale_fill_manual(values = c("blue", "pink")) + scale_color_manual(values = c("blue", "pink")) +  labs(x="Age at interview", y="Probability of vigorous physical activity (geom_smooth)", fill="Sex", color="Sex") + theme_bw() + facet_wrap(Wave~.)

p1 + p2

p3 <- ggplot(filter(share1, !is.na(pa_low_freq)), aes(age_int, ifelse(pa_low_freq==1, 1, 0))) + geom_smooth(aes(fill=gender, color=gender)) + scale_fill_manual(values = c("blue", "pink")) + scale_color_manual(values = c("blue", "pink")) +  labs(x="Age at interview", y="Probability of low-intensity physical activity (geom_smooth)", fill="Sex", color="Sex") + theme_bw() + facet_wrap(paste0("M",measurement_occasion)~.)

p4 <- ggplot(filter(share.data.first.interview,!is.na(cusmoke_imp)),
       aes(age_int, ifelse(cusmoke_imp == "Yes", 1, 0))) + geom_smooth(aes(fill =
                                                                             gender, , color = gender)) + scale_fill_manual(values = c("blue", "pink")) +   scale_color_manual(values = c("blue", "pink")) + 
  labs(x = "Age at baseline",
       y = "Probability of smoking at baseline (geom_smooth)",
       fill = "Sex",
       color = "Sex") + theme_bw()

p3 + p4 

p5 <- ggplot(filter(share.data.first.interview,!is.na(weight)),
       aes(age_int, weight)) + geom_smooth(aes(fill = gender, color = gender)) + scale_fill_manual(values = c("blue", "pink")) + scale_color_manual(values = c("blue", "pink")) + 
  labs(x =
         "Age at baseline",
       y = "Weight (geom_smooth)",
       fill = "Sex",
       color = "Sex") + theme_bw()

p6 <- ggplot(filter(share.data.first.interview, !is.na(height)),
       aes(age_int, height)) + geom_smooth(aes(fill = gender, color = gender)) + scale_fill_manual(values = c("blue", "pink")) + scale_color_manual(values = c("blue", "pink")) + 
  labs(x =
         "Age at baseline",
       y = "Height (geom_smooth)",
       fill = "Sex",
       color = "Sex") + theme_bw()

p5 + p6

5.4.2 Independent variables - Correlation (V2)

We explore the correlation between explanatory variables at baseline.

5.4.2.1 Overall correlation at baseline

Females on average have lower values of all the variables, as do older participants. The two types of physical activities are positively correlated, as are height and weight (to a larger extent). Age is negatively associated to all the explanatory variables.

df <-
  sharew1.baseline %>% select(
    gender,
    weight,
    height_imp,
    education_imp,
    pa_vig_freq,
    pa_low_freq,
    cusmoke_imp ,
    age_int
  ) %>% data.frame()
df$cusmoke_imp <- factor(df$cusmoke_imp)


dimnames(df)[[2]] <-
  c("Sex",
    "Weight",
    "Height",
    "Education",
    "PA Vig",
    "PA Low",
    "Smoking",
    "Age")

df <- df %>% mutate(across(where(is.factor), as.numeric))

Breaks <- seq(-1, 1, by = .01)
my_palette <-
  c(colorRampPalette(rev(
    RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")
  ))(length(Breaks) - 2) , "grey80", "grey80")
pheatmap(
  round(cor(df, use = "p"), 2),
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  display_numbers = TRUE,
  breaks = Breaks ,
  color = my_palette
)

5.4.2.1.1 At baseline, stratified by sex

The association between education and age was larger for females, as we are considering age and education at baseline, it is most likely due to a cohort effect (further explored in LE1).

Males

df <-
  sharew1.baseline %>% filter(gender == "Male") %>% select(weight,
                                                           height_imp,
                                                           education_imp,
                                                           pa_vig_freq,
                                                           pa_low_freq,
                                                           cusmoke_imp ,
                                                           age_int) %>% data.frame()
df$cusmoke_imp <- factor(df$cusmoke_imp)

dimnames(df)[[2]] <-
  c("Weight",
    "Height",
    "Education",
    "PA Vig",
    "PA Low",
    "Smoking",
    "Age")

df <- df %>% mutate(across(where(is.factor), as.numeric))

Breaks <- seq(-1, 1, by = .01)
my_palette <-
  c(colorRampPalette(rev(
    RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")
  ))(length(Breaks) - 2) , "grey80", "grey80")

g1 <-
  pheatmap(
    round(cor(df, use = "p"), 2),
    display_numbers = TRUE,
    breaks = Breaks ,
    color = my_palette,
    treeheight_col = 0,
    main = "Males",
    fontsize = 13
  )

g1

ggsave(
  "Figures/CorrMales.png",
  plot = g1,
  scale = 1.2,
  width = 6,
  height = 5
)


df <-
  sharew1.baseline %>% filter(gender == "Female") %>% select(weight,
                                                             height_imp,
                                                             education_imp,
                                                             pa_vig_freq,
                                                             pa_low_freq,
                                                             cusmoke_imp ,
                                                             age_int) %>% data.frame()
df$cusmoke_imp <- factor(df$cusmoke_imp)
dimnames(df)[[2]] <-
  c("Weight",
    "Height",
    "Education",
    "PA Vig",
    "PA Low",
    "Smoking",
    "Age")
df <- df %>% mutate(across(where(is.factor), as.numeric))

g2 <-
  pheatmap(
    round(cor(df, use = "p"), 2),
    display_numbers = TRUE,
    breaks = Breaks ,
    color = my_palette,
    treeheight_col = 0,
    main = "Females",
    fontsize = 13
  )

g2

ggsave(
  "Figures/CorrFemales.png",
  plot = g2,
  scale = 1.2,
  width = 6,
  height = 5
)


############ figure 12 for paper
g <- grid.arrange(g1[[4]], g2[[4]], nrow = 1)

ggsave(
  "FiguresSubmission/fig12.tif",
  scale = 1.2,
  width = 10,
  height = 5,
  compression = "lzw",
  dpi = 600,
  device = "tiff",
  plot = g
)
############

5.4.2.2 Additional explorations

Here we use all the observed data (with repeated measurements) to explore the association between some of the variables, namely height and weight.

Data cleaning performed before data screening removed very low values of height (even if they were considered plausible by the data cleaning performed within SHARE - see details in the import file), but some low values of height might still be due to errors.

The association between the two variables is as expected.

Some very low values of height do not seem consistent with the weight values. For these data points also the BMI is in some cases large.

p1 <- share1 %>%
  filter(!is.na(weight) & !is.na(age_int)) %>%
  mutate(weight = as.numeric(weight),
         age = as.numeric(age_int)) %>%
  ggplot(aes(x = age, y = weight)) +
  geom_point(
    shape = 16,
    size = 0.5,
    alpha = 0.5,
    color = "black"
  ) +
  geom_rug() +
  theme_minimal()

p2 <- share1 %>%
  filter(!is.na(height_imp) & !is.na(age_int)) %>%
  mutate(height = as.numeric(height_imp),
         age = as.numeric(age_int)) %>%
  ggplot(aes(x = age, y = height_imp)) +
  geom_point(
    shape = 16,
    size = 0.5,
    alpha = 0.5,
    color = "black"
  ) +
  geom_rug() +
  theme_minimal()

p3 <- share1 %>%
  filter(!is.na(weight) & !is.na(height_imp)) %>%
  mutate(height = as.numeric(height_imp),
         weight = as.numeric(weight)) %>%
  ggplot(aes(x = height_imp, y = weight)) +
  geom_point(
    shape = 16,
    size = 0.5,
    alpha = 0.5,
    color = "black"
  ) +
  geom_rug() +
  theme_minimal()
p4 <- share1 %>%
  filter(!is.na(weight) & !is.na(bmi)) %>%
  mutate(weight = as.numeric(weight),
         bmi = as.numeric(bmi)) %>%
  ggplot(aes(x = weight, y = bmi)) +
  geom_point(shape = 16,
             size = 0.95,
             alpha = 0.5,
             aes(col = factor(ifelse(
               height_imp <= 145, TRUE, FALSE
             )))) +
  geom_rug() +
  theme_minimal() + labs(color = "Height <= 145 cm") +  scale_color_manual(values =
                                                                             c("black", "red"))
##### Association between weight and height

(p3+p4) #library patchwork

5.4.3 Interactions between explanatory variables (V3)

The analysis strategy envisions the use of interactions between age and all time fixed explanatory variables (sex, education, height), the main interest will be in the interpretation of the interaction between sex and functions of age. The descriptive statistics of all the explanatory variables stratified by age groups and sex are reported in the section V1.

5.4.3.1 Association between age and weight, stratified by physical activity

The possible interaction between age and status of vigorous/low intensity activity with respect to weight is explored here, as it might be of interest to domain experts.

Scatter plot by vigorous physical activity and sex

ggplot(share1, aes(y = weight,
                   x = age_int)) +
  geom_point(
    shape = 16,
    size = 0.5,
    alpha = 0.5,
    color = "black"
  ) +
  geom_rug() +
  facet_grid(gender ~ pa_vig_freq) +
  theme_minimal() + geom_smooth()

Scatter plot by low physical activity and gender

r, cache = TRUE
}
ggplot(share1, aes(y = weight,
x = age_int)) +
geom_point(
shape = 16,
size = 0.5,
alpha = 0.5,
color = "black"
) +
geom_rug() +
facet_grid(gender ~ pa_low_freq) +
theme_minimal() + geom_smooth()

5.4.4 Stratification (VE1)

We stratify the univariate descriptions of the data by sex and age group first; we explore also the stratification by baseline wave.

5.4.4.1 Baseline measurements stratified by sex

We limit the exploration to baseline measurements (as the wave by wave exploration is conducted on complete data in VE1). The results reported below with tables and graphs indicate the following.

Females and males differed substantially in the distribution of height, weight, vigorous (but not low-intensity) physical activity, and education. Age was similar.

The distribution of grip strength was no longer asymmetric and bimodal, when data were stratified by sex, and it seems appropriate to assume a gaussian distribution; the digit preference was visible despite the automatized method of measurement.

Baseline characteristics by sex.
N
Male
N=2583
Female
N=2869
Wave : Wave 1 5452 0.29 746/2583 0.30 850/2869
  Wave 2 0.23 603/2583 0.24 699/2869
  Wave 4 0.08 215/2583 0.07 199/2869
  Wave 5 0.34 885/2583 0.35 998/2869
  Wave 6 0.05 134/2583 0.04 123/2869
age_int 5452 53.00 60.00 69.00
62.24 ±  9.92
53.00 60.00 70.00
62.77 ± 10.66
age_int_cat : 50-59 5452 0.48 1230/2583 0.47 1346/2869
  60-69 0.28 734/2583 0.27 768/2869
  70-80 0.18 472/2583 0.19 540/2869
  80+ 0.06 147/2583 0.07 215/2869
weight 5361 75.0 82.0 92.0
84.0 ± 13.6
60.0 68.0 77.0
69.4 ± 13.2
height_imp 5418 173.00 178.00 183.00
178.09 ±  7.07
161.00 165.00 170.00
165.41 ±  6.11
education_imp : Low 5428 0.15 397/2569 0.28 794/2859
  Medium 0.47 1204/2569 0.32 926/2859
  High 0.38 968/2569 0.40 1139/2859
pa_vig_freq 5423 0.64 1632/2566 0.57 1642/2857
pa_low_freq 5422 0.91 2331/2566 0.89 2555/2856
cusmoke_imp : Yes 5423 0.27 696/2567 0.24 699/2856
maxgrip 5272 40.00 48.00 55.00
47.09 ± 10.28
24.00 28.00 33.00
28.02 ±  7.01
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.
5.4.4.1.1 Graphical presentation of stratified distributions (by sex)
5.4.4.1.1.1 Grip strength

Distribution maxgrip by sex

5.4.4.1.1.2 Age

Distribution maxgrip by gender

5.4.4.1.1.3 Weight

Distribution weight by gender

5.4.4.1.1.4 Height

Distribution height by gender

5.4.4.2 Stratification based on grouped age at baseline and sex

As age is also the time metric from the analysis strategy, in this section we explore only some aspects (related to baseline measurement) of the association between age and the other variables, stratifying by sex. More detailed explorations are presented in the sections devoted to time trends. In most analyses participants are grouped in 10 year age groups. The use of categorized age is for illustrative purposes only, and is not recommended in data analysis.

The aim of this analysis is to identify independent variables that might be associated with age and sex.

Among females the association between age and education is stronger (older participants with lower education).

5.4.4.2.1 Females
s <-
  Hmisc::summaryM(
    #Wave + 
    education_imp  + pa_vig_freq  + pa_low_freq + cusmoke_imp +  weight + height_imp +  maxgrip ~ age_int_cat,
    data = droplevels(subset(sharew1.baseline, gender=="Female")),
    overall = FALSE,
    test = FALSE
  )

Hmisc::html(
  s,
  caption = 'Baseline characteristics by age category for females',
  exclude1 = TRUE,
  npct = 'both',
  digits = 3,
  prmsd = TRUE,
  brmsd = TRUE,
  msdsize = mu$smaller2
)
Baseline characteristics by age category for females.
N
50-59
N=1346
60-69
N=768
70-80
N=540
80+
N=215
education_imp : Low 2859 0.15 197/1341 0.27 208/ 766 0.47 253/ 539 0.64 136/ 213
  Medium 0.31 422/1341 0.37 283/ 766 0.31 166/ 539 0.26 55/ 213
  High 0.54 722/1341 0.36 275/ 766 0.22 120/ 539 0.10 22/ 213
pa_vig_freq 2857 0.68 908/1344 0.59 454/ 765 0.44 236/ 538 0.21 44/ 210
pa_low_freq 2856 0.93 1253/1344 0.93 710/ 765 0.85 455/ 537 0.65 137/ 210
cusmoke_imp : Yes 2856 0.28 380/1344 0.23 177/ 765 0.21 112/ 538 0.14 30/ 209
weight 2796 62.0 69.0 79.0
71.1 ± 13.3
60.0 68.0 76.0
69.4 ± 12.5
59.0 65.5 74.0
67.5 ± 13.4
55.0 62.0 70.0
63.2 ± 11.3
height_imp 2848 163.00 167.00 171.00
166.94 ±  5.97
161.00 165.00 169.00
164.93 ±  5.83
160.00 164.00 168.00
163.48 ±  5.67
158.00 162.00 167.00
162.13 ±  6.14
maxgrip 2760 28.00 31.00 35.00
31.27 ±  6.11
24.00 28.00 31.00
27.50 ±  5.85
20.00 24.00 27.00
23.81 ±  5.79
15.00 19.00 22.00
18.85 ±  5.29
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.
5.4.4.2.2 Males
s <-
  Hmisc::summaryM(
    #Wave + 
    education_imp  +  pa_vig_freq  + pa_low_freq + cusmoke_imp + weight + height_imp + maxgrip ~ age_int_cat,
    data = droplevels(subset(sharew1.baseline, gender=="Male")),
    overall = FALSE,
    test = FALSE
  )

Hmisc::html(
  s,
  caption = 'Baseline characteristics by age category for males',
  exclude1 = TRUE,
  npct = 'both',
  digits = 3,
  prmsd = TRUE,
  brmsd = TRUE,
  msdsize = mu$smaller2
)
Baseline characteristics by age category for males.
N
50-59
N=1230
60-69
N=734
70-80
N=472
80+
N=147
education_imp : Low 2569 0.13 153/1222 0.14 103/ 732 0.21 101/ 470 0.28 40/ 145
  Medium 0.47 571/1222 0.48 352/ 732 0.46 217/ 470 0.44 64/ 145
  High 0.41 498/1222 0.38 277/ 732 0.32 152/ 470 0.28 41/ 145
pa_vig_freq 2566 0.72 879/1221 0.66 480/ 732 0.49 229/ 469 0.31 44/ 144
pa_low_freq 2566 0.94 1151/1222 0.93 680/ 732 0.87 405/ 468 0.66 95/ 144
cusmoke_imp : Yes 2567 0.31 375/1222 0.27 195/ 732 0.21 97/ 468 0.20 29/ 145
weight 2565 76.0 85.0 95.0
86.3 ± 13.8
75.0 83.0 90.0
84.2 ± 13.2
71.5 80.0 88.0
80.0 ± 12.3
70.0 75.0 80.0
75.9 ± 11.0
height_imp 2570 175.00 180.00 184.00
179.70 ±  6.87
173.00 178.00 183.00
178.22 ±  6.89
171.00 175.00 179.00
175.16 ±  6.52
169.75 173.00 178.00
173.26 ±  6.12
maxgrip 2512 47.00 53.00 58.00
51.95 ±  8.77
41.00 47.00 52.00
46.59 ±  8.20
34.00 40.00 45.00
39.47 ±  7.94
26.00 32.00 37.00
31.60 ±  8.42
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.
5.4.4.2.3 Graphical presentation of stratified distributions of numerical variables
5.4.4.2.3.1 Weight
sharew1.baseline %>% 
  filter(!(is.na(weight))) %>%
  with(., histboxp(
    x = weight,
    group = paste(gender, age_int_cat),
    sd = TRUE,
    bins = 200
  ))

Distribution weight by age (categorical)

5.4.4.2.3.2 Height
sharew1.baseline %>%
  filter(!(is.na(height_imp))) %>%
  with(., histboxp(
    x = height_imp,
    group = paste(gender, age_int_cat),
    sd = TRUE,
    bins = 200
  ))

Distribution height by age (categorical)

5.4.4.3 Stratification by wave of the baseline measurements

The overall summary of baseline measurements over waves is give in the table below (participants can be included in the study at different waves).

s <-
  Hmisc::summaryM(
    #Wave+ 
    gender+ age_int+ age_int_cat+ weight+ height_imp+ education_imp+  pa_vig_freq+ pa_low_freq+ cusmoke_imp+ maxgrip ~ Wave,
    data = sharew1.baseline,
    overall = FALSE,
    test = FALSE
  )

Hmisc::html(
  s,
  caption = 'Overall baseline characteristics across waves',
  exclude1 = TRUE,
  npct = 'both',
  digits = 3,
  prmsd = TRUE,
  brmsd = TRUE,
  msdsize = mu$smaller2, 
  scroll = TRUE
)
Overall baseline characteristics across waves.
N
Wave 1
N=1596
Wave 2
N=1302
Wave 4
N=414
Wave 5
N=1883
Wave 6
N=257
gender : Female 5452 0.53 850/1596 0.54 699/1302 0.48 199/ 414 0.53 998/1883 0.48 123/ 257
age_int 5452 56.00 62.00 72.00
64.40 ± 10.58
54.00 61.00 70.00
62.77 ± 10.01
51.00 52.00 54.00
53.24 ±  4.04
56.00 63.00 71.00
63.98 ± 10.02
51.00 52.00 52.00
53.78 ±  6.43
age_int_cat : 50-59 5452 0.40 644/1596 0.44 576/1302 0.94 390/ 414 0.39 737/1883 0.89 229/ 257
  60-69 0.29 455/1596 0.30 393/1302 0.05 19/ 414 0.33 617/1883 0.07 18/ 257
  70-80 0.22 353/1596 0.19 250/1302 0.01 4/ 414 0.21 398/1883 0.03 7/ 257
  80+ 0.09 144/1596 0.06 83/1302 0.00 1/ 414 0.07 131/1883 0.01 3/ 257
weight 5361 65.0 74.0 84.0
74.7 ± 14.6
65.0 75.0 85.0
75.5 ± 14.3
68.0 78.0 90.0
80.1 ± 16.9
65.0 75.0 86.0
76.9 ± 15.6
70.0 80.0 90.0
80.8 ± 15.4
height_imp 5418 164.00 170.00 177.00
170.46 ±  8.96
165.00 170.00 177.00
171.05 ±  9.01
167.00 174.00 180.00
173.94 ±  8.86
165.00 171.00 178.00
171.52 ±  9.20
168.00 174.00 181.00
174.46 ±  9.37
education_imp : Low 5428 0.26 406/1586 0.22 284/1293 0.10 42/ 410 0.22 422/1882 0.14 37/ 257
  Medium 0.44 696/1586 0.38 494/1293 0.35 143/ 410 0.37 698/1882 0.39 99/ 257
  High 0.31 484/1586 0.40 515/1293 0.55 225/ 410 0.40 762/1882 0.47 121/ 257
pa_vig_freq 5423 0.60 956/1591 0.50 636/1284 0.67 278/ 413 0.65 1215/1879 0.74 189/ 256
pa_low_freq 5422 0.88 1400/1592 0.89 1143/1284 0.94 388/ 413 0.92 1719/1877 0.92 236/ 256
cusmoke_imp : Yes 5423 0.31 499/1590 0.28 356/1284 0.26 106/ 413 0.20 380/1879 0.21 54/ 257
maxgrip 5272 26.0 34.0 46.0
36.1 ± 13.2
25.0 33.0 44.0
35.0 ± 12.6
33.0 43.0 55.0
43.7 ± 13.0
28.0 35.0 47.0
37.1 ± 12.2
33.0 40.0 54.8
43.0 ± 12.5
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.

Due to the study design, that envisioned the inclusion of refreshment samples limited to the younger age groups, the participants first included in Wave 4 or 6 differed from those included in the other waves: they were substantially younger and were more frequently males - the differences in age and gender should explain the difference in the other variables: higher education, higher values for weight, height, grip strength.

The numerical variables are summarized also graphically.

5.4.4.4 Age across baseline waves

 sharew1.baseline %>% 
     ggplot(aes(x=age_int)) +
     #geom_histogram(binwidth=1, aes(y=stat(density), group=Wave)) + 
     geom_histogram(binwidth=1, aes(group=Wave)) + 
     facet_grid(. ~ Wave) +
     theme_bw() +labs("Age", y="Frequency")

ggsave("figures/AgeBaselineWave.png", device="png", scale=1, width = 10, height= 3 , units="in", pointsize=12)

5.4.4.5 Weight across baseline waves

 sharew1.baseline %>% 
     ggplot(aes(x=weight)) +
     #geom_histogram(binwidth=1, aes(y=stat(density), group=Wave)) + 
     geom_histogram(binwidth=1, aes(group=Wave)) + 
     facet_grid(. ~ Wave) +
     theme_bw() +labs("Weight (kg)", y="Frequency")

5.4.4.6 Height across baseline waves

 sharew1.baseline %>% 
     ggplot(aes(x=height)) +
     #geom_histogram(binwidth=1, aes(y=stat(density), group=Wave)) + 
     geom_histogram(binwidth=1, aes(group=Wave)) + 
     facet_grid(. ~ Wave) +
     theme_bw() +labs("Height (cm)", y="Frequency")

5.4.4.7 Grip strength across baseline waves

 sharew1.baseline %>% 
     ggplot(aes(x=maxgrip)) +
     #geom_histogram(binwidth=1, aes(y=stat(density), group=Wave)) + 
     geom_histogram(binwidth=1, aes(group=Wave)) + 
     facet_grid(. ~ Wave) +
     theme_bw() +labs("Grip strength (kg)", y="Frequency")

5.5 Longitudinal aspects

5.5.1 Outcome variable - Profiles (L1)

Here the aim is to visualize the individual profiles of the outcome for the participants.

The number of subjects is very large and profile plots of grip strength are not clearly conveying the information about individual variability. To visualize effectively the profile plots we use different strategies: we use selected subgroups of participants (100 per group, stratifying the plots by sex and age groups), and different time metrics (age or measurement occasion). Interactive plots are also available (see the separate output page devoted to interactive plots).

The graphs that use age as a time metric give an idea of the shape of trajectory for model specification (which has to be determined a priori), those based on measurement occasion give a clearer overview of the individual trajectories, as participants enter the study at different ages. Even though age was included as a continuous time metric in the analysis strategy, a summary stratified by ten-year groups can serve as a quick overview of the longitudinal trends by age.

5.5.1.1 Age as time metric

Overall, the profile plots highlight the trend towards diminishing grip strength with age and the rate of change seems to accelerate over age (the slope at later ages is bigger than at the beginning). Older participants are followed up for shorter times, substantial increases or decreases in grip strength bewtween measurements are possible. The variability of the outcome tends to decrease at later measurement occasions, especially in the older age groups.

# data preparation for the plotting
# define the quantiles of the initial grip stregth
set.seed(1234)

sharew.mo <- sharew.mo %>% group_by(gender,
                                    cut(
                                      age_int.M1,
                                      c(50, 60, 70, 80, max(age_int.M1, na.rm = TRUE) + 1),
                                      right = FALSE,
                                      labels = c("50-59", "60-69", "70-80", "80+")
                                    )) %>% mutate(
                                      percMaxgrip1AgeGroup = rank(maxgrip.M1) / length(maxgrip.M1),
                                      rankMaxgrip1AgeGroup = rank(maxgrip.M1, ties.method =
                                                                    "random")
                                    )


# percentile of maxgrip at baseline by sex/age group
share1 = share1 %>% mutate(percMaxgrip1AgeGroup = left_join(share1, sharew.mo, by =
                                                              "mergeid")$percMaxgrip1AgeGroup)
#rank of maxgrip at baseline by sex/age group
share1 = share1 %>% mutate(rankMaxgrip1AgeGroup = left_join(share1, sharew.mo, by =
                                                              "mergeid")$rankMaxgrip1AgeGroup)



sharew.mo <- sharew.mo %>% group_by(gender) %>% mutate(percMaxgrip1Sex = rank(maxgrip.M1) /
                                                         length(maxgrip.M1))

#percentile by sex
sharew.mo <-
  sharew.mo %>% group_by(gender) %>% mutate(percMaxgrip1Sex = rank(maxgrip.M1) /
                                              length(maxgrip.M1))
share1 = share1 %>% mutate(percMaxgrip1Sex = left_join(share1, sharew.mo, by =
                                                         "mergeid")$percMaxgrip1Sex)

All profiles

# males

g1 <-
  share1[share1$gender == "Male", ] %>% ggplot(aes(age_int, maxgrip, group =
                                                     mergeid, color = percMaxgrip1Sex)) + geom_line(alpha = 0.2) + theme_bw()  +
  labs(
    x = "Age at interview",
    y = "Grip strength (kg)",
    color = "Initial grip strength (quantile)",
    title = "Males"
  ) + theme(legend.position = "bottom")
#geom_smooth(color="blue",  aes(age_int, maxgrip))

g1

ggsave(
  "figures/profilesAgeDenmarkMales.png" ,
  device = "png",
  scale = 1,
  width = 12,
  height = 8 ,
  units = "in",
  pointsize = 12
)
# females

g2 <-
  share1[share1$gender == "Female", ] %>% ggplot(aes(age_int, maxgrip, group =
                                                       mergeid, color = percMaxgrip1Sex)) + geom_line(alpha = 0.2) + theme_bw()  +
  labs(
    x = "Age at interview",
    y = "Grip strength (kg)",
    color = "Initial grip strength (quantile)",
    title = "Females"
  ) + theme(legend.position = "bottom")
#geom_smooth(color="blue",  aes(age_int, maxgrip))

g2

ggsave(
  "figures/profilesAgeDenmarkFemales.png" ,
  device = "png",
  scale = 1,
  width = 12,
  height = 8 ,
  units = "in",
  pointsize = 12
)


########## figure 14 for paper
p = g1 + g2

ggsave(
  "figuresSubmission/Fig14.tif" ,
  scale = 1,
  width = 12,
  height = 8 ,
  units = "in",
  pointsize = 12,
  device = "tiff",
  compression = "lzw",
  plot = p
)

Subsets of profiles

Here we display the profiles of approximately 400 individuals for each sex group.

df <-
  cbind.data.frame(
    id = sharew1.baseline$mergeid,
    mg.occasion,
    age.occasion,
    gender = sharew1.baseline$gender,
    age_int_cat = sharew1.baseline$age_int_cat,
    age_int = sharew1.baseline$age_int
  )

dimnames(df)[[2]] <-
  c(
    "ID",
    paste0("M", 1:num.waves),
    paste0("MA", 1:num.waves),
    "gender",
    "age_int_cat",
    "age_int"
  )


df <-
  df %>% group_by(gender) %>% mutate(Quantile = rank(M1) / length(M1))


df.melt <-
  reshape2::melt(
    df,
    id.vars = c("ID", "gender", "age_int_cat", "age_int", "Quantile"),
    measure.vars = c("M1", "M2", "M3", "M4", "M5", "M6", "M7")
  )

df.melt2 <-
  reshape2::melt(
    df,
    id.vars = c("ID", "gender", "age_int_cat", "age_int", "Quantile"),
    measure.vars = c("MA1", "MA2", "MA3", "MA4", "MA5", "MA6", "MA7")
  )

df.melt <- data.frame(df.melt, age_var = df.melt2$value)


#index of the observations that represents a quantile
my.q <- seq(0.00001, 1, by = 0.0025)

tmp <- subset(df, gender == "Female" & !is.na(M1))

index.qF <-
  sapply(1:length(my.q), function(i)
    which.is.max(-abs(rank(tmp$M1) / length(tmp$M1) - my.q[i])))




tmp <- subset(df, gender == "Male" & !is.na(M1))

index.qM <-
  sapply(1:length(my.q), function(i)
    which.is.max(-abs(rank(tmp$M1) / length(tmp$M1) - my.q[i])))


df.melt %>% filter(is.element(ID, subset(df, gender == "Male" &
                                           !is.na(M1))$ID[index.qM]) |
                     is.element(ID, subset(df, gender == "Female" &
                                             !is.na(M1))$ID[index.qF])) %>%
  ggplot(aes(age_var, value, group = ID, color = Quantile)) + geom_line() +
  labs(x = "Age at interview", y = "Grip strength (kg)", title = "Males") + theme_bw() + facet_grid(gender ~
                                                                                                      .)

5.5.1.2 Measurement occasion as time metric

Here we show the profile plots by measurement occasion. In our case study the plots based on measurement occasion and stratified by age group. The plots based on subsets are more easily interpretable, also with this time metric.

5.5.1.2.1 All profile plots
# calculates percentile and rank of
set.seed(1234) # set for reproducibility

sharew.mo <- sharew.mo %>% group_by(gender,
                                    cut(
                                      age_int.M1,
                                      c(50, 60, 70, 80, max(age_int.M1, na.rm = TRUE) + 1),
                                      right = FALSE,
                                      labels = c("50-59", "60-69", "70-80", "80+")
                                    )) %>% mutate(
                                      percMaxgrip1AgeGroup = rank(maxgrip.M1) / length(maxgrip.M1),
                                      rankMaxgrip1AgeGroup = rank(maxgrip.M1, ties.method =
                                                                    "random")
                                    )


# percentile of maxgrip at baseline by sex/age group
share1 = share1 %>% mutate(percMaxgrip1AgeGroup = left_join(share1, sharew.mo, by =
                                                              "mergeid")$percMaxgrip1AgeGroup)
#rank of maxgrip at baseline by sex/age group
share1 = share1 %>% mutate(rankMaxgrip1AgeGroup = left_join(share1, sharew.mo, by =
                                                              "mergeid")$rankMaxgrip1AgeGroup)



sharew.mo <- sharew.mo %>% group_by(gender) %>% mutate(percMaxgrip1Sex = rank(maxgrip.M1) /
                                                         length(maxgrip.M1))

#percentile by sex
sharew.mo <-
  sharew.mo %>% group_by(gender) %>% mutate(percMaxgrip1Sex = rank(maxgrip.M1) /
                                              length(maxgrip.M1))
share1 = share1 %>% mutate(percMaxgrip1Sex = left_join(share1, sharew.mo, by =
                                                         "mergeid")$percMaxgrip1Sex)
q.data <-
  sharew.mo %>% select("mergeid",
                       "gender",
                       starts_with("maxgrip"),
                       "age_int.M1",
                       "percMaxgrip1AgeGroup")
q.data$age_int_cat <-
  cut(
    q.data$age_int.M1,
    c(50, 60, 70, 80, max(q.data$age_int.M1, na.rm = TRUE) + 1),
    right = FALSE,
    labels = c("50-59", "60-69", "70-80", "80+")
  )
dimnames(q.data)[[2]] <-
  c(
    "ID",
    "gender",
    "M1",
    "M2",
    "M3",
    "M4",
    "M5",
    "M6",
    "M7",
    "age_int",
    "Quantile",
    "age_int_cat"
  )

df.melt <-
  reshape2::melt(
    q.data,
    id.vars = c("ID", "gender", "age_int_cat", "Quantile"),
    measure.vars = c("M1", "M2", "M3", "M4", "M5",
                     "M6", "M7")
  )



#ggplot(df.melt, aes(variable, value, group=ID, color = Quantile)) + geom_line()+ facet_grid(age_int_cat~gender) +
#  labs(x="Measurement occasion", y="Grip strength (kg)") + theme_bw()


ggplot(df.melt, aes(variable, value, group = ID, color = Quantile)) + geom_line(alpha =
                                                                                  .2) + facet_grid(gender ~ age_int_cat) +
  labs(x = "Measurement occasion", y = "Grip strength (kg)", color = "Initial grip\nstrength (quantile)") + theme_bw()

ggsave(
  "figures/profilesMaxgripAll_v2.png" ,
  device = "png",
  scale = 1,
  width = 12,
  height = 6 ,
  units = "in",
  pointsize = 12
)

5.5.1.3 Subsets of profile plots

Profile plots of grip strength, choosing 100 subjects for each age/sex category with the baseline value of grip strength at a certain quantile of the distribution (100 quantiles 0.00001 to 1, by 0.01). The plot includes only subjects with at least three valid measurements (can be changed). This type of plot substitutes the classical profile plot in this application.

#spaghetti plots based on quantiles

set.seed(123)


df <-
  cbind.data.frame(
    id = sharew1.baseline$mergeid,
    mg.occasion,
    #missing.occasion.5_cv_factors,
    gender = sharew1.baseline$gender,
    age_int_cat = sharew1.baseline$age_int_cat,
    age_int = sharew1.baseline$age_int
  )

dimnames(df)[[2]] <-
  c("ID",
    paste0("M", 1:num.waves),
    #paste0("I", 1:num.waves),
    "gender",
    "age_int_cat",
    "age_int")

df$rank <- sharew.mo$rankMaxgrip1AgeGroup
df$percentile <- sharew.mo$percMaxgrip1AgeGroup

df$use <- FALSE



df[df$gender == "Female" &
     df$age_int_cat == "50-59", ]$use[is.element(df[df$gender == "Female" &
                                                      df$age_int_cat == "50-59", ]$rank,
                                                 round(seq(1, max(df[df$gender == "Female" &
                                                                       df$age_int_cat == "50-59", ]$rank),
                                                           length.out = 100)))] <-
  1



df[df$gender == "Female" &
     df$age_int_cat == "60-69", ]$use[is.element(df[df$gender == "Female" &
                                                      df$age_int_cat == "60-69", ]$rank , round(seq(1, max(df[df$gender == "Female" &
                                                                                                                df$age_int_cat == "60-69", ]$rank), length.out = 100)))] <-
  1

df[df$gender == "Female" &
     df$age_int_cat == "70-80", ]$use[is.element(df[df$gender == "Female" &
                                                      df$age_int_cat == "70-80", ]$rank , round(seq(1, max(df[df$gender == "Female" &
                                                                                                                df$age_int_cat == "70-80", ]$rank), length.out = 100)))] <-
  1

df[df$gender == "Female" &
     df$age_int_cat == "80+", ]$use[is.element(df[df$gender == "Female" &
                                                    df$age_int_cat == "80+", ]$rank , round(seq(1, max(df[df$gender == "Female" &
                                                                                                            df$age_int_cat == "80+", ]$rank), length.out = 100)))] <-
  1

#if it does not select enough, add "random", problem with the ties/ranks.... difficult to overcome with small samples
sum.selected <-
  sum(df[df$gender == "Female" & df$age_int_cat == "80+",]$use)
sum.notselected <-
  sum(df[df$gender == "Female" &
           df$age_int_cat == "80+",]$use == 0)
if (sum.selected < 100)
  df[df$gender == "Female" &
       df$age_int_cat == "80+" &
       df$use == 0,]$use <-
  sample(c(rep(0, sum.notselected - (100 - sum.selected)), rep(1, 100 - sum.selected)))



df[df$gender == "Male" &
     df$age_int_cat == "50-59",]$use[is.element(df[df$gender == "Male" &
                                                     df$age_int_cat == "50-59",]$rank , round(seq(1, max(df[df$gender == "Male" &
                                                                                                              df$age_int_cat == "50-59",]$rank), length.out = 100)))] <-
  1

df[df$gender == "Male" &
     df$age_int_cat == "60-69",]$use[is.element(df[df$gender == "Male" &
                                                     df$age_int_cat == "60-69",]$rank , round(seq(1, max(df[df$gender == "Male" &
                                                                                                              df$age_int_cat == "60-69",]$rank), length.out = 100)))] <-
  1

df[df$gender == "Male" &
     df$age_int_cat == "70-80",]$use[is.element(df[df$gender == "Male" &
                                                     df$age_int_cat == "70-80",]$rank , round(seq(1, max(df[df$gender == "Male" &
                                                                                                              df$age_int_cat == "70-80",]$rank), length.out = 100)))] <-
  1

df[df$gender == "Male" &
     df$age_int_cat == "80+",]$use[is.element(df[df$gender == "Male" &
                                                   df$age_int_cat == "80+",]$rank , round(seq(1, max(df[df$gender == "Male" &
                                                                                                          df$age_int_cat == "80+",]$rank), length.out = 100)))] <-
  1

#if it does not select enough, add "random", problem with the ties/ranks.... difficult to overcome with small samples
sum.selected <-
  sum(df[df$gender == "Male" & df$age_int_cat == "80+",]$use)
sum.notselected <-
  sum(df[df$gender == "Male" & df$age_int_cat == "80+",]$use == 0)
if (sum.selected < 100)
  df[df$gender == "Male" &
       df$age_int_cat == "80+" &
       df$use == 0,]$use <-
  sample(c(rep(0, sum.notselected - (100 - sum.selected)), rep(1, 100 - sum.selected)))

df <- df[df$use == 1,]
#subset to only some percentiles in each subgroup


#matplot(t(tmp[index.q,1:7]), type="l")

df.melt <-
  reshape2::melt(
    df,
    id.vars = c("ID", "gender", "age_int_cat", "percentile"),
    measure.vars = c("M1", "M2", "M3", "M4", "M5",
                     "M6", "M7")
  )



#ggplot(df.melt, aes(variable, value, group=ID, color = Quantile)) + geom_line()+ facet_grid(age_int_cat~gender) +
#  labs(x="Measurement occasion", y="Grip strength (kg)") + theme_bw()


ggplot(df.melt, aes(variable, value, group = ID, color = percentile)) + geom_line(alpha =
                                                                                    .7) + facet_grid(gender ~ age_int_cat) +
  labs(x = "Measurement occasion", y = "Grip strength (kg)", color = "Initial grip\nstrength (quantile)") + theme_bw()

ggsave(
  "figures/profilesMaxgripQuantiles_v3.png" ,
  device = "png",
  width = 16,
  height = 8 ,
  units = "in",
  pointsize = 12,
  scale = .9
)


### figure 13 for paper

ggsave(
  "figuresSubmission/fig13.tif" ,
  device = "tiff",
  width = 16,
  height = 8 ,
  units = "in",
  pointsize = 12,
  scale = .9,
  compression = "lzw"
)


###############

We also show the profiles of the participants with complete follow-up (7 measurements)

#spaghetti plots based on quantiles
df <- cbind.data.frame(id=sharew1.baseline$mergeid, mg.occasion, #missing.occasion.5_cv_factors, 
                       gender=sharew1.baseline$gender, age_int_cat=sharew1.baseline$age_int_cat, age_int=sharew1.baseline$age_int)

dimnames(df)[[2]] <- c("ID", paste0("M", 1:num.waves), #paste0("I", 1:num.waves), 
                       "gender", "age_int_cat", "age_int")

#select only the individuals with at least 3 observations
which.use <- df %>% select(starts_with("M")) %>% apply(1, function(x) sum(!is.na(x))==7)
df <- df[which.use, ]



#index of the observations that represents a quantile
#my.q <- seq(0, 1, by=0.02)
my.q <- seq(0.00001, 1, by=0.01)



tmp <- subset(df, gender=="Female" & age_int_cat=="50-59" & !is.na(M1)); 

index.q <- 
  sapply(1:length(my.q), function(i) which.is.max(-abs(rank(tmp$M1)/length(tmp$M1)-my.q[i])))

q.data <- cbind.data.frame(tmp[index.q,], Quantile=my.q)


tmp <- subset(df, gender=="Female" & age_int_cat=="60-69" & !is.na(M1)); 

index.q <- 
  sapply(1:length(my.q), function(i) which.is.max(-abs(rank(tmp$M1)/length(tmp$M1)-my.q[i])))

q.data2 <- cbind.data.frame(tmp[index.q,], Quantile=my.q)
q.data <- rbind.data.frame(q.data, q.data2)



tmp <- subset(df, gender=="Female" & age_int_cat=="70-80" & !is.na(M1)); 

index.q <- 
  sapply(1:length(my.q), function(i) which.is.max(-abs(rank(tmp$M1)/length(tmp$M1)-my.q[i])))

q.data2 <- cbind.data.frame(tmp[index.q,], Quantile=my.q)
q.data <- rbind.data.frame(q.data, q.data2)


tmp <- subset(df, gender=="Male" & age_int_cat=="50-59" & !is.na(M1)); 

index.q <- 
  sapply(1:length(my.q), function(i) which.is.max(-abs(rank(tmp$M1)/length(tmp$M1)-my.q[i])))

q.data2 <- cbind.data.frame(tmp[index.q,], Quantile=my.q)
q.data <- rbind.data.frame(q.data, q.data2)


tmp <- subset(df, gender=="Male" & age_int_cat=="60-69" & !is.na(M1)); 

index.q <- 
  sapply(1:length(my.q), function(i) which.is.max(-abs(rank(tmp$M1)/length(tmp$M1)-my.q[i])))

q.data2 <- cbind.data.frame(tmp[index.q,], Quantile=my.q)
q.data <- rbind.data.frame(q.data, q.data2)



tmp <- subset(df, gender=="Male" & age_int_cat=="70-80" & !is.na(M1)); 

index.q <- 
  sapply(1:length(my.q), function(i) which.is.max(-abs(rank(tmp$M1)/length(tmp$M1)-my.q[i])))

q.data2 <- cbind.data.frame(tmp[index.q,], Quantile=my.q)
q.data <- rbind.data.frame(q.data, q.data2)




#matplot(t(tmp[index.q,1:7]), type="l")

df.melt <- reshape2::melt(q.data, id.vars=c("ID", "gender", "age_int_cat", "Quantile"), measure.vars= c("M1", "M2", "M3", "M4", "M5", 
                                                                                "M6", "M7"))



#ggplot(df.melt, aes(variable, value, group=ID, color = Quantile)) + geom_line(alpha=.5)+ facet_grid(age_int_cat~gender) + 
#  labs(x="Measurement occasion", y="Grip strength (kg)") + theme_bw() 


#ggsave("figures/profilesMaxgripQuantilesNoNA.png" , device="png", scale=1, width = 8, height= 8 , units="in", pointsize=12)
ggplot(subset(df.melt, gender=="Female"), aes(variable, value, group=ID, color = Quantile)) + geom_line()+ facet_grid(.~age_int_cat) + 
    labs(x="Measurement occasion", y="Grip strength (kg)", title = "Females") + theme_bw()

ggplot(subset(df.melt, gender=="Male"), aes(variable, value, group=ID, color = Quantile)) + geom_line()+ facet_grid(.~age_int_cat) + 
    labs(x="Measurement occasion", y="Grip strength (kg)", title = "Males") + theme_bw()

5.5.3 Outcome variable - Correlation and variability (L3)

Here we used complete pairs of observations and use Pearson correlation to quantify the correlation between measurements taken in different waves/at different measurement occasions. The correlations are evaluated separately for males and females.

The following explorations evaluate the correlations using waves, measurement occasions, time since baseline, and age as time metrics.

These explorations can be useful for determining the characteristics of the outcome based on different time metrics. Using waves we can identify some systematic errors due to wave, while measurement occasion/age is more directly related to the research question (decline of grip strength in time/with age).

The variability of the outcome at different ages is explored only for age as a time metric.

5.5.3.1 Wave as time metric

#setting the breaks for the heatmaps, to make colors comparable
Breaks <- seq(0, 1, by = .01)


#matrix with Wave (names of the waves if not missing, NA otherwise)
Wave.all <- select(sharew1, starts_with("Wave"))

#matrix with the maxgrip, NA if missing
mg <-  select(sharew1, starts_with("maxgrip"))
wavepart <-
  apply(mg, 1, function(x)
    paste0(c(1, 2, 3, 4, 5, 6, 7)[!is.na(x)], collapse = "")) #wideformat


#maxgrip defined by occasion of measurement
#mg.occasion <-matrix(NA, ncol=num.waves, nrow=num.obs.w1)

#for(i in 1:num.waves)
#  mg.occasion[,i] <- cbind.data.frame(mg, NA, NA, NA, NA, NA, NA, NA)[cbind(seq_along(baseline.wave.col), baseline.wave.col+i-1)]
#dimnames(mg.occasion)[[2]] <- paste0("M" , 1:num.waves)

missing.occasion.5_cv_factors <-
  apply(missing.occasion.5_cv, 2, function(x)
    factor(
      x,
      levels = c(-1000,-100,-12,-11, 1),
      labels = c(
        "Out-of-sample",
        "Death",
        "Lost to FU",
        "Intermittent NA",
        "Interview"
      )
    ))


df <-
  cbind.data.frame(
    id = sharew1.baseline$mergeid,
    mg.occasion,
    missing.occasion.5_cv_factors,
    gender = sharew1.baseline$gender,
    age_int_cat = sharew1.baseline$age_int_cat,
    age_int = sharew1.baseline$age_int
  )

dimnames(df)[[2]] <-
  c(
    "ID",
    paste0("M", 1:num.waves),
    paste0("I", 1:num.waves),
    "gender",
    "age_int_cat",
    "age_int"
  )

df.melt  <- reshape2::melt(df)


my.mat <- round(cor(mg, use = "p"), 2)
dimnames(my.mat)[[1]] <-
  dimnames(my.mat)[[2]] <- paste("Wave", 1:num.waves)

#kable(my.mat, caption = "Correlation of grip strength across waves") %>% kable_styling()

my_palette <-
  c(colorRampPalette(rev(
    RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")
  ))(length(Breaks) - 2) , "grey80", "grey80")

The correlations are slightly lower for females compared to males. It is interesting to note that the decrease in measurements taken further apart decreases more substantially, if sexes are analyzed separately.

pheatmap(
  cor(mg[sharew1.baseline$gender == "Male", ], use = "p"),
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  display_numbers = TRUE,
  main = "Males",
  breaks = Breaks,
  labels_row = paste("Wave", 1:7),
  labels_col = paste("Wave", 1:7),
  color = my_palette
)

pheatmap(
  cor(mg[sharew1.baseline$gender == "Female", ], use = "p"),
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  display_numbers = TRUE,
  main = "Females",
  breaks = Breaks,
  labels_row = paste("Wave", 1:7),
  labels_col = paste("Wave", 1:7),
  color = my_palette
)

Matrix with correlations (above the diagonal), SD (on the diagonal) and covariances (under the diagonal), females

my.mat <- cor(mg.wave[sharew$gender == "Female", ], use = "p")
diag(my.mat) <-
  apply(mg.wave[sharew$gender == "Female", ], 2, sd, na.rm = TRUE)
my.mat[lower.tri(my.mat)] <-
  cov(mg.wave[sharew$gender == "Female", ], use = "p")[lower.tri(cov(mg.wave[sharew$gender ==
                                                                               "Female", ], use = "p"))]
dimnames(my.mat)[[1]] <-
  dimnames(my.mat)[[2]] <- paste("Wave", 1:num.waves)

kable(my.mat, digits = 2, caption = "Correlation/SD/covariances of grip strength across waves, females") %>% kable_styling()
Correlation/SD/covariances of grip strength across waves, females
Wave 1 Wave 2 Wave 3 Wave 4 Wave 5 Wave 6 Wave 7
Wave 1 7.26 0.76 0.73 0.73 0.73 0.66 0.59
Wave 2 35.01 6.81 0.80 0.80 0.79 0.75 0.72
Wave 3 33.12 35.65 6.88 0.81 0.78 0.75 0.72
Wave 4 31.54 33.31 34.68 6.76 0.85 0.82 0.78
Wave 5 29.07 31.06 31.37 34.68 6.52 0.82 0.81
Wave 6 25.77 30.18 30.80 33.75 33.45 6.64 0.82
Wave 7 22.19 26.97 27.12 29.78 30.37 32.40 6.29

Generalized pairs plot

df <- cbind.data.frame(gender = sharew1.baseline$gender, mg)
dimnames(df)[[2]] <- c("Sex", paste("Wave", 1:7))
GGally::ggpairs(
  df,
  cols = 2:5,
  mapping = ggplot2::aes(colour = Sex),
  upper = list(continuous = wrap("cor", size = 3))
)

ggsave(
  "figures/corrMGWave.png",
  device = "png",
  scale = 1,
  width = 10,
  height = 10
)

######### figure 11 for paper

ggsave(
  "figuresSubmission/Fig11.tif",
  device = "tiff",
  scale = 1,
  width = 10,
  height = 10,
  compression = "lzw",
  dpi = 600
)


#######################

5.5.3.2 Measurement occassion as time metric

Correlation matrix

The variance decreased with measurement occasion, the correlations decreased with larger time lags.

df <- cbind.data.frame(gender = sharew1.baseline$gender, mg.occasion)
GGally::ggpairs(
  df,
  cols = 2:5,
  mapping = ggplot2::aes(colour = gender),
  upper = list(continuous = wrap("cor", size = 3))
)

ggsave(
  "figures/corr.png",
  device = "png",
  scale = 1.5,
  width = 10,
  height = 10
)
pheatmap(
  cor(mg.occasion[sharew1.baseline$gender == "Male", ], use = "p"),
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  display_numbers = TRUE,
  main = "Males",
  breaks = Breaks,
  labels_row = paste0("M", 1:7),
  labels_col = paste0("M", 1:7),
  color = my_palette
)

my.mat <-
  cor(mg.occasion[sharew1.baseline$gender == "Male", ], use = "p")
diag(my.mat) <-
  apply(mg.occasion[sharew1.baseline$gender == "Male", ], 2, sd, na.rm =
          TRUE)
my.mat[lower.tri(my.mat)] <-
  cov(mg.occasion[sharew1.baseline$gender == "Male", ], use = "p")[lower.tri(cov(mg.occasion, use =
                                                                                   "p"))]
dimnames(my.mat)[[1]] <-
  dimnames(my.mat)[[2]] <- paste0("M", 1:num.waves)

pheatmap(
  cor(mg.occasion[sharew1.baseline$gender == "Female", ], use = "p"),
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  display_numbers = TRUE,
  main = "Females",
  labels_row = paste0("M", 1:7),
  labels_col = paste0("M", 1:7),
  color = my_palette,
  breaks = Breaks
)

kable(my.mat, digits = 2, caption = "Correlation/SD/covariances of grip strength across measurement occasions, males") %>% kable_styling()
Correlation/SD/covariances of grip strength across measurement occasions, males
M1 M2 M3 M4 M5 M6 M7
M1 10.3 0.83 0.82 0.79 0.74 0.73 0.71
M2 78.8 9.85 0.86 0.83 0.80 0.78 0.76
M3 75.9 78.06 9.78 0.87 0.84 0.81 0.82
M4 71.1 75.19 82.14 9.66 0.87 0.83 0.85
M5 66.0 70.77 77.77 78.94 9.81 0.86 0.87
M6 59.6 64.57 66.46 65.86 70.82 9.28 0.89
M7 57.7 59.17 64.78 67.01 71.27 72.79 9.05
my.mat <-
  cor(mg.occasion[sharew1.baseline$gender == "Female", ], use = "p")
diag(my.mat) <-
  apply(mg.occasion[sharew1.baseline$gender == "Female", ], 2, sd, na.rm =
          TRUE)
my.mat[lower.tri(my.mat)] <-
  cov(mg.occasion[sharew1.baseline$gender == "Female", ], use = "p")[lower.tri(cov(mg.occasion, use =
                                                                                     "p"))]
dimnames(my.mat)[[1]] <-
  dimnames(my.mat)[[2]] <- paste0("M", 1:num.waves)
kable(my.mat, digits = 2, caption = "Correlation/SD/covariances of grip strength across measurement occasions, females") %>% kable_styling()
Correlation/SD/covariances of grip strength across measurement occasions, females
M1 M2 M3 M4 M5 M6 M7
M1 7.01 0.78 0.78 0.77 0.74 0.71 0.59
M2 36.06 6.92 0.81 0.79 0.75 0.74 0.65
M3 33.93 34.21 6.52 0.84 0.80 0.78 0.69
M4 34.18 33.73 34.99 6.45 0.84 0.80 0.70
M5 30.68 30.67 31.56 31.29 6.43 0.85 0.78
M6 27.92 28.56 28.90 28.14 32.61 6.18 0.78
M7 22.19 21.45 23.11 21.59 24.64 26.18 5.92

5.5.3.3 Age as time metric (two-year groups, from 50 years old)

5.5.3.3.1 Correlations

The correlations between measurements obtained at different ages were estimated. Data were grouped in two year categories to obtain bigger groups on which the estimates can be based (the choice reflects the time lag between waves). Only estimates based on at least 20 observations are shown, we use age_int that is an integer value.

mg.age.occasion <- matrix(NA, nrow = nrow(sharew1), ncol = 50)
for (i in 1:nrow(age.occasion))
  for (j in 1:num.waves)
    #mg.age.occasion[i,age.occasion[i,j]-49-(age.occasion[i,j] %% 2)] <- mg.occasion[i,j]
    if ((age.occasion[i, j] > 49 &
         age.occasion[i, j] < 91) & !is.na(age.occasion[i, j]))
      #mg.age.occasion[i,age.occasion[i,j]-49+ age.occasion[i,j]%%2] <- mg.occasion[i,j]
      mg.age.occasion[i, age.occasion[i, j] - 49 + (age.occasion[i, j] -
                                                      1) %% 2] <- mg.occasion[i, j]


mg.age.occasion <- mg.age.occasion[, seq(2, 42, by = 2)]

dimnames(mg.age.occasion)[[2]] <- seq(50, 90, by = 2)


# removed the graphical display of the overall correlation

#pheatmap(cor(mg.age.occasion, use="p"), cluster_cols=FALSE, cluster_rows=FALSE, display_numbers = TRUE, breaks=Breaks, color= my_palette)
png(
  "figures/corrAgeMGFemales.png",
  width = 20,
  height = 20,
  units = "cm",
  res = 600
)

a <- cor(mg.age.occasion[sharew1$gender == "Female", ], use = "p")
a1 <- Hmisc::rcorr(mg.age.occasion[sharew1$gender == "Female", ])
a[a1$n <= 20] <- NA

a.f <-
  pheatmap(
    a ,
    cluster_cols = FALSE,
    cluster_rows = FALSE,
    display_numbers = TRUE,
    breaks = Breaks ,
    color = my_palette,
    main = "Females"
  )

a.f

graphics.off()

pheatmap(
  a ,
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  display_numbers = TRUE,
  breaks = Breaks ,
  color = my_palette,
  main = "Females"
)


#ggsave("figures/corrAgeMGFemales.png", width=10, height=10)


png(
  "figures/corrAgeMGMales.png",
  width = 20,
  height = 20,
  units = "cm",
  res = 600
)

a <- cor(mg.age.occasion[sharew1$gender == "Male", ], use = "p")
a1 <- Hmisc::rcorr(mg.age.occasion[sharew1$gender == "Male", ])
a[a1$n <= 20] <- NA

a.m <-
  pheatmap(
    a ,
    cluster_cols = FALSE,
    cluster_rows = FALSE,
    display_numbers = TRUE,
    breaks = Breaks ,
    color = my_palette,
    main = "Males"
  )
a.m
graphics.off()


pheatmap(
  a ,
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  display_numbers = TRUE,
  breaks = Breaks ,
  color = my_palette,
  main = "Males"
)



###########figure 15 for paper
############ figure 12 for paper
g <- grid.arrange(a.m[[4]], a.f[[4]], nrow = 1)
ggsave(
  "FiguresSubmission/fig15.tif",
  scale = 1.2,
  width = 10,
  height = 5,
  compression = "lzw",
  dpi = 600,
  device = "tiff",
  plot = g
)
############

rm(a.m, a.f)

The correlation of grip strength between measurements taken at different ages, indicated that the serial correlation was very high, generally above 0.70 for two-year periods and reduced slightly with the distance; correlations were generally slightly larger for males than for females.

As an additional display, see below the scatterplots of the values of grip strength for the individuals with age grouped in two-year categories, by sex (in the 50-59, 60-69, 70-79, 80-89 age categories).

df <-
  cbind.data.frame(gender = sharew1.baseline$gender, mg.age.occasion)[, 1:6]
GGally::ggpairs(
  df,
  cols = 2:5,
  mapping = ggplot2::aes(colour = gender),
  upper = list(continuous = wrap("cor", size = 3))
)

df <-
  cbind.data.frame(gender = sharew1.baseline$gender, mg.age.occasion)[, c(1, 7:12)]
GGally::ggpairs(
  df,
  cols = 2:5,
  mapping = ggplot2::aes(colour = gender),
  upper = list(continuous = wrap("cor", size = 3))
)

df <-
  cbind.data.frame(gender = sharew1.baseline$gender, mg.age.occasion)[, c(1, 12:17)]
GGally::ggpairs(
  df,
  cols = 2:5,
  mapping = ggplot2::aes(colour = gender),
  upper = list(continuous = wrap("cor", size = 3))
)

df <-
  cbind.data.frame(gender = sharew1.baseline$gender, mg.age.occasion)[, c(1, 16:21)]
GGally::ggpairs(
  df,
  cols = 2:5,
  mapping = ggplot2::aes(colour = gender),
  upper = list(continuous = wrap("cor", size = 3))
)

5.5.4 Variability

The graph previously displayed in the longitudinal trends (L2) domain can be used also to assess how the variability of the measurements varies with age - for example, to identify possible problems with the hypothesis of constant variance.

The graph below show in a single graph the average, standard deviation and coefficient of variation of the outcome, grouping the participants in two-year groups. The SD decreases with age, as does the mean, while the CV increaes.

tmp <-
  share1 %>% group_by(gender, age_int = cut(
    share1$age_int,
    breaks = seq(49, 102, by = 2),
    labels = seq(50, 100, by = 2)
  )) %>% dplyr::summarise(
    Mean = mean(maxgrip, na.rm = TRUE),
    SD = sd(maxgrip, na.rm = TRUE),
    CV = SD / Mean
  )
tmp <-
  cbind.data.frame(tmp, ID = 1:nrow(tmp)) #%>% mutate(tmp, CV=SD/Mean)
tmp <- reshape2::melt(tmp,
                      id.vars = c(1, 2, 6),
                      measure.vars = c(3:4, 5))
ggplot(tmp, aes(x = (age_int), y = value)) + geom_point() + geom_smooth() + theme_bw() +  facet_wrap(gender ~
                                                                                                       variable, scale = "free") + labs(x = "Age", y = "Grip strength (mean, SD or cv)")

Note that these graphs are produced using all the longitudinal data, the findings based on the baseline data are similar (trends in SD are less visible at older ages - due to smaller sample sizes?)

tmp <-
  sharew1.baseline %>% group_by(gender,
                                age_int = cut(
                                  sharew1.baseline$age_int,
                                  breaks = seq(49, 100, by = 2),
                                  labels = seq(50, 98, by = 2)
                                )) %>% dplyr::summarise(
                                  Mean = mean(maxgrip, na.rm = TRUE),
                                  SD = sd(maxgrip, na.rm = TRUE),
                                  CV = SD / Mean
                                )
tmp <-
  cbind.data.frame(tmp, ID = 1:nrow(tmp)) #%>% mutate(tmp, CV=SD/Mean)
tmp <- reshape2::melt(tmp,
                      id.vars = c(1, 2, 6),
                      measure.vars = c(3:4, 5))
ggplot(tmp, aes(x = (age_int), y = value)) + geom_point() + geom_smooth() + theme_bw() +  facet_wrap(gender ~
                                                                                                       variable, scale = "free") + labs(x = "Age", y = "Grip strength (mean, SD or cv)")

#plots figures to save
g <- ggplot(share1, aes(x = (age_int), y = maxgrip)) +  geom_smooth() +
  theme_bw() + facet_wrap(gender ~ .) + labs(x = "Age", y = "Grip strength (kg)")
ggsave(
  "FIgures/maxgripMean.tiff",
  compression = "lzw",
  scale = .5,
  width = 10,
  height = 3,
  plot = g
)
tmp <-
  share1 %>% group_by(gender, age_int) %>% dplyr::summarise(sd.maxgrip =
                                                              sd(maxgrip, na.rm = TRUE))
g <-
  ggplot(tmp, aes(x = (age_int), y = sd.maxgrip)) + geom_point() + geom_smooth() +
  theme_bw() + facet_wrap(gender ~ .) + labs(x = "Age", y = "SD of maxgrip")
ggsave(
  "FIgures/maxgripSD.tiff",
  compression = "lzw",
  scale = .5,
  width = 10,
  height = 3,
  plot = g
)

5.5.6 Evaluation of possible age-cohort effects (LE1)

Here we explore the possible effect of birth cohort in LE1. The following graphs show the smoothed association between age and grip strength, evaluated using baseline measurements (blue), all longitudinal data (f), age-cohort trajectories (red lines, grouping participants in 5 year groups based on their age at baseline). The graphs are shown separately for males and females.

5.5.6.1 Overall description

Here we define the birth cohort variable that will be used to explore the possible presence of cohort effects in some of the characteristics of the participants. Participants are grouped in 10 year groups, except for the older cohort (including 19 years because of small sample size). There is a strong association between age and birth cohort due to the design of the study. The association is present analyzing all data (first graph) or just the baseline interview (second graph)

share1 <- share1 %>% mutate(`Cohort`=cut(share1$yrbirth, breaks = c(1906, 1925, 1935, 1945, 1955, 1965)))
ggplot(share1, aes(age_int)) + geom_histogram(binwidth = 1,
                                              col = "black",
                                              fill = "white") + facet_wrap(Cohort ~ ., ncol = 5) + theme_bw() +
  labs(x = "Age at interview", y = "Number of interviews")

ggsave(
  "Figures/CohortAge_Hist.png",
  width = 15,
  height = 5,
  scale = 0.8
)


ggplot(share1  %>% filter(as.character(firstwave) == as.character(Wave)), aes(age_int)) +
  geom_histogram(binwidth = 1,
                 col = "black",
                 fill = "white") + facet_wrap(Cohort ~ ., ncol = 5) + theme_bw() + 
              labs(x ="Age at baseline interview")

ggplot(share1, aes(factor(Wave_num), age_int)) + geom_boxplot(binwidth = 1,
                                                              col = "black",
                                                              fill = "white") + facet_grid(. ~ Cohort) + theme_bw() + labs(x = "Wave at interview", y =
                                                                                                                             "Age at interview")

ggsave(
  "Figures/CohortAge_Boxplot.png",
  width = 15,
  height = 4,
  scale = 0.8
)

5.5.6.2 Association of birth cohort with outcome

The following graphs show the smoothed association between age and grip strength, evaluated using baseline measurements (black solid line), all longitudinal data (black dashed line), year-of-birth-cohort trajectories (colored lines described in the legend, grouping participants in 5 year groups based on their year of birth, larger grouping is used for extreme years where less participants were included). The graphs are shown separately for males and females.

There is a clear birth cohort effect

share1[share1$gender == "Male", ] %>% mutate(`Cohort` = cut(
  share1$yrbirth[share1$gender == "Male"],
  breaks = c(1906, 1925, 1935, 1940, 1945, 1950, 1955, 1965)
)) %>% ggplot(aes(age_int, maxgrip)) +
  #geom_point(alpha=0.01, data=subset(share1[share1$gender=="Male",], measurement_occasion==1)) +
  theme_bw() +  #facet_wrap(country ~. ) +
  # longitudinal
  geom_smooth(
    color = "black",
    aes(age_int, maxgrip),
    linetype = 2,
    se = FALSE
  ) +
  #cross-sectional
  geom_smooth(
    color = "black",
    aes(age_int, maxgrip),
    se = FALSE,
    data = subset(share1[share1$gender == "Male", ], measurement_occasion ==
                    1),
    size = 2
  ) +  # + facet_wrap(.~country)
  geom_smooth(aes(age_int, maxgrip, group = Cohort, color = Cohort),
              alpha = .1) +
  theme_bw()  + labs(title = "Males", x = "Age", y = "Grip strength (kg)")

ggsave(
  "figures/AgeCohortBirthMales.png" ,
  device = "png",
  scale = 1,
  width = 15,
  height = 8 ,
  units = "in",
  pointsize = 12
)




share1[share1$gender == "Female", ] %>% mutate(`Cohort` = cut(yrbirth, breaks = c(
  1906, 1925, 1935, 1940, 1945, 1950, 1955, 1965
))) %>% ggplot(aes(age_int, maxgrip)) +
  #geom_point(alpha=0.01, data=subset(share1[share1$gender=="Male",], measurement_occasion==1)) +
  theme_bw() +  #facet_wrap(country ~. ) +
  # longitudinal
  geom_smooth(
    color = "black",
    aes(age_int, maxgrip),
    linetype = 2,
    se = FALSE
  ) +
  #cross-sectional
  geom_smooth(
    color = "black",
    aes(age_int, maxgrip),
    se = FALSE,
    data = subset(share1[share1$gender == "Female", ], measurement_occasion ==
                    1),
    size = 2
  ) +  # + facet_wrap(.~country)
  geom_smooth(aes(age_int, maxgrip, group = Cohort, color = Cohort),
              alpha = .1) +
  theme_bw()  + labs(title = "Females", x = "Age", y = "Grip strength (kg)")

ggsave(
  "figures/AgeCohortBirthFemales.png" ,
  device = "png",
  scale = 1,
  width = 15,
  height = 8 ,
  units = "in",
  pointsize = 12
)


ggsave(
  "figuresSubmission/fig16.tif" ,
  device = "tiff",
  scale = 1,
  width = 15,
  height = 8 ,
  units = "in",
  pointsize = 12,
  compression = "lzw"
)

In a similar way we also explored the longitudinal age effect grouping the participants that belonged to the same age group, defined in 5-year groups.

ggplot(share1[share1$gender == "Male", ],  aes(age_int, maxgrip)) +
  #geom_point(alpha=0.01, data=subset(share1[share1$gender=="Male",], measurement_occasion==1)) +
  theme_bw() +  #facet_wrap(country ~. ) +
  # longitudinal
  geom_smooth(
    color = "black",
    aes(age_int, maxgrip),
    linetype = 2,
    data = subset(share1[share1$gender == "Male", ]),
    se = FALSE
  ) +
  #cross-sectional
  geom_smooth(
    color = "black",
    aes(age_int, maxgrip),
    se = FALSE,
    data = subset(share1[share1$gender == "Male", ], measurement_occasion ==
                    1),
    size = 2
  ) +  # + facet_wrap(.~country) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(share1[share1$gender == "Male", ], share1[share1$gender ==
                                                              "Male", ]$age_int_baseline < 55),
    alpha = .1
  ) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(
      share1[share1$gender == "Male", ],
      share1[share1$gender == "Male", ]$age_int_baseline >= 55 &
        share1[share1$gender == "Male", ]$age_int_baseline < 60
    ),
    alpha = .1
  ) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(
      share1[share1$gender == "Male", ],
      share1[share1$gender == "Male", ]$age_int_baseline >= 60 &
        share1[share1$gender == "Male", ]$age_int_baseline < 65
    ),
    alpha = .1
  ) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(
      share1[share1$gender == "Male", ],
      share1[share1$gender == "Male", ]$age_int_baseline >= 65 &
        share1[share1$gender == "Male", ]$age_int_baseline < 70
    ),
    alpha = .1
  ) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(
      share1[share1$gender == "Male", ],
      share1[share1$gender == "Male", ]$age_int_baseline >= 70 &
        share1[share1$gender == "Male", ]$age_int_baseline < 75
    ),
    alpha = .1
  ) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(
      share1[share1$gender == "Male", ],
      share1[share1$gender == "Male", ]$age_int_baseline >= 75 &
        share1[share1$gender == "Male", ]$age_int_baseline <= 80
    ),
    alpha = .1
  ) + theme_bw()  + labs(title = "Males", x = "Age", y = "Grip strength (kg)")

ggsave(
  "figures/AgeCohortMales.png" ,
  device = "png",
  scale = 1,
  width = 12,
  height = 8 ,
  units = "in",
  pointsize = 12
)





ggplot(share1[share1$gender == "Female", ],  aes(age_int, maxgrip)) +
  #geom_point(alpha=0.01, data=subset(share1[share1$gender=="Female",], measurement_occasion==1)) +
  theme_bw() +  #facet_wrap(country ~. ) +
  geom_smooth(
    color = "black",
    aes(age_int, maxgrip),
    linetype = 2,
    data = subset(share1[share1$gender == "Female", ]),
    size = 2,
    se = FALSE
  ) +
  geom_smooth(
    color = "black",
    size = 2,
    aes(age_int, maxgrip),
    se = FALSE,
    data = subset(share1[share1$gender == "Female", ], measurement_occasion ==
                    1)
  ) +  # + facet_wrap(.~country) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(share1[share1$gender == "Female", ], share1[share1$gender ==
                                                                "Female", ]$age_int_baseline < 55),
    alpha = .1
  ) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(
      share1[share1$gender == "Female", ],
      share1[share1$gender == "Female", ]$age_int_baseline >= 55 &
        share1[share1$gender == "Female", ]$age_int_baseline < 60
    ),
    alpha = .1
  ) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(
      share1[share1$gender == "Female", ],
      share1[share1$gender == "Female", ]$age_int_baseline >= 60 &
        share1[share1$gender == "Female", ]$age_int_baseline < 65
    ),
    alpha = .1
  ) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(
      share1[share1$gender == "Female", ],
      share1[share1$gender == "Female", ]$age_int_baseline >= 65 &
        share1[share1$gender == "Female", ]$age_int_baseline < 70
    ),
    alpha = .1
  ) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(
      share1[share1$gender == "Female", ],
      share1[share1$gender == "Female", ]$age_int_baseline >= 70 &
        share1[share1$gender == "Female", ]$age_int_baseline < 75
    ),
    alpha = .1
  ) +
  geom_smooth(
    color = "red",
    aes(age_int, maxgrip),
    data = subset(
      share1[share1$gender == "Female", ],
      share1[share1$gender == "Female", ]$age_int_baseline >= 75 &
        share1[share1$gender == "Female", ]$age_int_baseline <= 80
    ),
    alpha = .1
  )  + labs(title = "Females", x = "Age", y = "Grip strength (kg)") #+

ggsave(
  "figures/AgeCohortFemales.png" ,
  device = "png",
  scale = 1,
  width = 12,
  height = 8 ,
  units = "in",
  pointsize = 12
)

5.5.6.3 Association of birth cohort and physical activity

When the summaries of physical activity is stratified by birth cohort we observe that there is not much decline with age for the younger cohorts, while the decline is very steep for the oldest cohort (that is including all the oldest participants). The cohorts differ in their engagement in vigorous PA. Among females the effect is different.

share1[share1$gender == "Male", ] %>% mutate(`Cohort` = cut(
  share1$yrbirth[share1$gender == "Male"],
  breaks = c(1906, 1925, 1935, 1940, 1945, 1950, 1955, 1965)
)) %>%
  filter(!is.na(pa_low_freq)) %>%
  ggplot(aes(age_int, ifelse(pa_low_freq == 1, 1, 0))) +
  #geom_point(alpha=0.01, data=subset(share1[share1$gender=="Male",], measurement_occasion==1)) +
  theme_bw() +  #facet_wrap(country ~. ) +
  # longitudinal
  geom_smooth(color = "black",
              linetype = 2,
              se = FALSE) +
  #cross-sectional
  geom_smooth(
    color = "black",
    ,
    se = FALSE,
    data = subset(share1[share1$gender == "Male" &
                           !is.na(share1$pa_vig_freq), ], measurement_occasion == 1),
    size = 2
  ) +  # + facet_wrap(.~country)
  geom_smooth(aes(
    age_int,
    ifelse(pa_low_freq == 1, 1, 0),
    group = Cohort,
    color = Cohort
  ),
  alpha = .1) +
  theme_bw()  + labs(title = "Males", x = "Age", y = "Probability of moderate PA")

share1[share1$gender == "Female", ] %>% mutate(`Cohort` = cut(
  share1$yrbirth[share1$gender == "Female"],
  breaks = c(1906, 1925, 1935, 1940, 1945, 1950, 1955, 1965)
)) %>%
  filter(!is.na(pa_low_freq)) %>%
  ggplot(aes(age_int, ifelse(pa_low_freq == 1, 1, 0))) +
  #geom_point(alpha=0.01, data=subset(share1[share1$gender=="Male",], measurement_occasion==1)) +
  theme_bw() +  #facet_wrap(country ~. ) +
  # longitudinal
  geom_smooth(color = "black",
              linetype = 2,
              se = FALSE) +
  #cross-sectional
  geom_smooth(
    color = "black",
    ,
    se = FALSE,
    data = subset(share1[share1$gender == "Male" &
                           !is.na(share1$pa_low_freq), ], measurement_occasion == 1),
    size = 2
  ) +  # + facet_wrap(.~country)
  geom_smooth(aes(
    age_int,
    ifelse(pa_low_freq == 1, 1, 0),
    group = Cohort,
    color = Cohort
  ),
  alpha = .1) +
  theme_bw()  + labs(title = "Females", x = "Age", y = "Probability of moderate PA")

When the summary is stratified by birth cohort we observe that there is not much decline with age for the younger participants (belonging to younger birth cohorts), while the decline is very steep for the oldest cohort (that is including all the oldest participants). The cohorts differ in their engagement in vigorous PA. The smoothed estimates by cohort much more variable for females.

Possible explanation for the effect for females

share1[share1$gender == "Male", ] %>% mutate(`Cohort` = cut(
  share1$yrbirth[share1$gender == "Male"],
  breaks = c(1906, 1925, 1935, 1940, 1945, 1950, 1955, 1965)
)) %>%
  filter(!is.na(pa_vig_freq)) %>%
  ggplot(aes(age_int, ifelse(pa_vig_freq == 1, 1, 0))) +
  #geom_point(alpha=0.01, data=subset(share1[share1$gender=="Male",], measurement_occasion==1)) +
  theme_bw() +  #facet_wrap(country ~. ) +
  # longitudinal
  geom_smooth(color = "black",
              linetype = 2,
              se = FALSE) +
  #cross-sectional
  geom_smooth(
    color = "black",
    ,
    se = FALSE,
    data = subset(share1[share1$gender == "Male" &
                           !is.na(share1$pa_vig_freq), ], measurement_occasion == 1),
    size = 2
  ) +  # + facet_wrap(.~country)
  geom_smooth(aes(
    age_int,
    ifelse(pa_vig_freq == 1, 1, 0),
    group = Cohort,
    color = Cohort
  ),
  alpha = .1) +
  theme_bw()  + labs(title = "Males", x = "Age", y = "Probability of vigorous PA")

ggsave(
  "figures/PAVigAgeCohortMales.png",
  width = 6,
  height = 6,
  scale = 1.2
)
share1[share1$gender == "Female", ] %>% mutate(`Cohort` = cut(
  share1$yrbirth[share1$gender == "Female"],
  breaks = c(1906, 1925, 1935, 1940, 1945, 1950, 1955, 1965)
)) %>%
  filter(!is.na(pa_vig_freq)) %>%
  ggplot(aes(age_int, ifelse(pa_vig_freq == 1, 1, 0))) +
  #geom_point(alpha=0.01, data=subset(share1[share1$gender=="Male",], measurement_occasion==1)) +
  theme_bw() +  #facet_wrap(country ~. ) +
  # longitudinal
  geom_smooth(color = "black",
              linetype = 2,
              se = FALSE) +
  #cross-sectional
  geom_smooth(
    color = "black",
    ,
    se = FALSE,
    data = subset(share1[share1$gender == "Female" &
                           !is.na(share1$pa_vig_freq), ], measurement_occasion == 1),
    size = 2
  ) +  # + facet_wrap(.~country)
  geom_smooth(aes(
    age_int,
    ifelse(pa_vig_freq == 1, 1, 0),
    group = Cohort,
    color = Cohort
  ),
  alpha = .1) +
  theme_bw()  + labs(title = "Females", x = "Age", y = "Probability of vigorous PA")