Example preprocessing steps with mobile recording data

What about duplicates?

Check mobile recordings dataset by name (audio file):

# Audio file names (`name`) that occur on more than one recording row,
# ordered from the most to the least frequent.
recordings_duplicates <- mobile_recordings %>%
  count(name, sort = TRUE) %>%
  filter(n > 1) %>%
  pull(name)

# Show every recording row whose file name is shared with another row.
filter(mobile_recordings, name %in% recordings_duplicates)
#> # A tibble: 0 × 29
#> # … with 29 variables: patient_id <dbl>, dw_mobilerecording_id <dbl>,
#> #   st_mobilerecording_id <dbl>, dw_patient_id <dbl>, st_patient_id <dbl>,
#> #   create_date_date_id <dbl>, create_date_time_id <dbl>, name <chr>,
#> #   header <chr>, duration <dbl>, is_analyzed <lgl>, is_data_available <lgl>,
#> #   is_outgoing_call <lgl>, chunks_count <dbl>, successfully_recorded <lgl>,
#> #   data <lgl>, is_synced <lgl>, sync_date_date_id <dbl>,
#> #   sync_date_time_id <dbl>, missing_fields_flag <lgl>, …

Check chunks by chunk id and frame no:

# Count rows per (chunk id, frame number) pair and keep only the pairs that
# appear more than once, largest counts first.
chunks_duplicates <- mobile_chunks %>%
  count(dw_mobilerecordingchunk_id, frame_nr, sort = TRUE) %>%
  filter(n > 1)

# Only when duplicated (chunk id, frame number) pairs exist: list the affected
# chunk rows, together with their duplicate count `n`, sorted so that the
# duplicates sit next to each other.
if (nrow(chunks_duplicates) > 0) {
  mobile_chunks %>%
    # Name the join keys explicitly so the join does not depend on whichever
    # column names the two tables happen to share, and so no "Joining with"
    # message is emitted.
    inner_join(
      chunks_duplicates,
      by = c("dw_mobilerecordingchunk_id", "frame_nr")
    ) %>%
    arrange(dw_mobilerecordingchunk_id, frame_nr)
}

There are no duplicates in our data. If there were any, we would need to filter them out.

Dates

Data collected before 2018 is unreliable due to a bug in the library that was used to collect all the parameters of a call. We’ll work only with data from 2018 onwards.

# Keep only rows dated on or after the 2018-01-01 cut-off.
# NOTE(review): the cut-off is given as a string, so the comparison relies on
# the type of `visit_date` / `create_date` (not visible here) - confirm they
# are Date or ISO-formatted character columns.
visits <- filter(visits, visit_date >= "2018-01-01")
mobile_recordings <- filter(mobile_recordings, create_date >= "2018-01-01")

We don’t filter mobile_chunks as it’s not clear when the data was recorded. Instead, we’ll use corresponding rows from mobile_recordings dataset to get only valid rows by joining the datasets together.

Prepare datasets for further analysis

See the distribution of mobile_recordings$chunks_count values:

# Five-number summary (plus mean) of the number of chunks per recording.
summary(mobile_recordings$chunks_count)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>     1.0     1.0     2.0     2.3     3.5     5.0

Recordings with more than 10 chunks are unreliable.

# Recordings with at most 10 chunks are treated as reliable.
# Rows whose `chunks_count` is NA satisfy neither condition below, so they
# end up in neither of the two result sets.
mobile_recordings_filtered <- filter(mobile_recordings, chunks_count <= 10)

# Ids (and chunk counts) of the recordings that exceed the 10-chunk limit.
unreliable_rec_id <- select(
  filter(mobile_recordings, chunks_count > 10),
  dw_mobilerecording_id, chunks_count
)

unreliable_rec_id
#> # A tibble: 0 × 2
#> # … with 2 variables: dw_mobilerecording_id <dbl>, chunks_count <dbl>

Calculate aggregates

# Columns to aggregate.
# NOTE(review): presumably a character vector of column names of
# `mobile_chunks` (it is passed to all_of() below) - confirm against
# get_sample_call_parameters().
sample_call_parameters <- get_sample_call_parameters()

# Per-recording aggregates: nest all chunk rows of a recording into one
# list-column entry (`data`), then compute the mean and standard deviation of
# every call parameter within that entry. The result keeps one row per
# recording with a one-row tibble of statistics in the `stats` list-column;
# its columns are named `<parameter>_mean` / `<parameter>_sd` and are emitted
# per parameter (mean, then sd).
stats <- mobile_chunks %>%
  tidyr::nest(data = -dw_mobilerecording_id) %>%
  mutate(stats = lapply(data, function(recording_frames) {
    # all_of() makes it explicit that the names come from an external
    # character vector rather than from a column of the data.
    summarise(
      recording_frames,
      across(
        all_of(sample_call_parameters),
        list(mean = mean, sd = sd)
      )
    )
  }))

# Flatten the per-recording statistics: drop the nested raw data, expand the
# `stats` list-column into regular columns, and order the result as
# recording id, all means, then all standard deviations.
stats <- stats %>%
  select(dw_mobilerecording_id, stats) %>%
  tidyr::unnest(cols = stats) %>%
  select(
    dw_mobilerecording_id,
    ends_with("mean"),
    ends_with("sd")
  )

stats
#> # A tibble: 10 × 173
#>    dw_mobilere…¹ pcm_L…² pcm_z…³ voice…⁴ f0_sm…⁵ f0env…⁶ pcm_f…⁷ pcm_f…⁸ pcm_f…⁹
#>            <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
#>  1         21987   -20.6   0.222   0.541  159.    189.   1.57e-9 4.04e-7    891.
#>  2         21927   -14.0   0.398   0.239    0       0    1.34e-6 1.27e-5   1612.
#>  3         21929   -20.5   0.351   0.221    0       0    7.94e-9 1.10e-7   1535.
#>  4         21940   -19.0   0.335   0.275    7.20    3.56 1.04e-7 1.75e-6   1640.
#>  5         21981   -17.9   0.147   0.458   75.1   138.   5.05e-6 1.91e-5    828.
#>  6         21991   -21.1   0.216   0.200    0       0    5.43e-8 1.68e-7    856.
#>  7         21911   -19.9   0.229   0.371   31.0   134.   1.25e-6 4.68e-6    801.
#>  8         21913   -16.2   0.363   0.298    3.33   25.8  1.40e-6 4.00e-5   1921.
#>  9         21918   -13.2   0.379   0.362   37.1   128.   1.29e-6 1.01e-4   1771.
#> 10         21922   -20.1   0.372   0.214    0       0    6.04e-9 1.51e-7   1603.
#> # … with 164 more variables: pcm_fftMag_spectralRollOff50_0_sma_mean <dbl>,
#> #   pcm_fftMag_spectralRollOff75_0_sma_mean <dbl>,
#> #   pcm_fftMag_spectralRollOff90_0_sma_mean <dbl>,
#> #   pcm_fftmag_spectralflux_sma_mean <dbl>,
#> #   pcm_fftmag_spectralcentroid_sma_mean <dbl>,
#> #   pcm_fftmag_spectralmaxpos_sma_mean <dbl>,
#> #   pcm_fftmag_spectralminpos_sma_mean <dbl>, f0final_sma_mean <dbl>, …

Stats preview

We calculated statistics like these (only the first 16 column names are shown here):

#>  [1] "F0semitoneFrom27_5Hz_sma3nz_mean"   "F0semitoneFrom27_5Hz_sma3nz_sd"    
#>  [3] "alpharatio_sma3_mean"               "alpharatio_sma3_sd"                
#>  [5] "audSpec_Rfilt_sma_compare_0__mean"  "audSpec_Rfilt_sma_compare_0__sd"   
#>  [7] "audSpec_Rfilt_sma_compare_10__mean" "audSpec_Rfilt_sma_compare_10__sd"  
#>  [9] "audSpec_Rfilt_sma_compare_11__mean" "audSpec_Rfilt_sma_compare_11__sd"  
#> [11] "audSpec_Rfilt_sma_compare_12__mean" "audSpec_Rfilt_sma_compare_12__sd"  
#> [13] "audSpec_Rfilt_sma_compare_13__mean" "audSpec_Rfilt_sma_compare_13__sd"  
#> [15] "audSpec_Rfilt_sma_compare_14__mean" "audSpec_Rfilt_sma_compare_14__sd"

For example, the statistics for the variable f1frequency_sma3nz:

#> # A tibble: 10 × 2
#>    f1frequency_sma3nz_mean f1frequency_sma3nz_sd
#>                      <dbl>                 <dbl>
#>  1                    644.                  213.
#>  2                    782.                  252.
#>  3                    865.                  302.
#>  4                    738.                  275.
#>  5                    647.                  224.
#>  6                    622.                  238.
#>  7                    644.                  285.
#>  8                    899.                  254.
#>  9                    873.                  245.
#> 10                    807.                  324.