diff --git a/.github/.gitignore b/.github/.gitignore index f920f889..2d19fc76 100644 --- a/.github/.gitignore +++ b/.github/.gitignore @@ -1,2 +1 @@ - *.html diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml deleted file mode 100644 index 38a50449..00000000 --- a/.github/workflows/R-CMD-check.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples -# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help -on: - push: - branches: [main, DEV] - pull_request: - branches: [main, DEV] - -name: R-CMD-check - -jobs: - R-CMD-check: - runs-on: ${{ matrix.config.os }} - - name: ${{ matrix.config.os }} (${{ matrix.config.r }}) - - strategy: - fail-fast: false - matrix: - config: - - {os: macos-latest, r: 'release'} - - {os: windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-latest, r: 'release'} - - {os: ubuntu-latest, r: 'oldrel-1'} - - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - R_KEEP_PKG_SOURCE: yes - - steps: - - uses: actions/checkout@v3 - - - uses: r-lib/actions/setup-pandoc@v2 - - - uses: r-lib/actions/setup-r@v2 - with: - r-version: ${{ matrix.config.r }} - http-user-agent: ${{ matrix.config.http-user-agent }} - use-public-rspm: true - - - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: any::rcmdcheck - needs: check - - - uses: r-lib/actions/check-r-package@v2 - with: - upload-snapshots: true diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index a7d8ffd2..bac8513c 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [DEV] + branches: [wbpage_dev] pull_request: - branches: [DEV] + branches: [wbpage_dev] release: types: [published] workflow_dispatch: diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml deleted file mode 100644 index ddf8b7bd..00000000 --- a/.github/workflows/test-coverage.yaml +++ /dev/null @@ -1,50 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples -# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help -on: - push: - branches: [main, DEV] - pull_request: - branches: [main, DEV] - -name: test-coverage - -jobs: - test-coverage: - runs-on: ubuntu-latest - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - - steps: - - uses: actions/checkout@v3 - - - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - - - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: any::covr - needs: coverage - - - name: Test coverage - run: | - covr::codecov( - quiet = FALSE, - clean = FALSE, - install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") - ) - shell: Rscript {0} - - - name: Show testthat output - if: always() - run: | - ## -------------------------------------------------------------------- - find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true - shell: bash - - - name: Upload test results - if: failure() - uses: actions/upload-artifact@v3 - with: - name: coverage-test-failures - path: ${{ runner.temp }}/package diff --git a/.gitignore b/.gitignore index 061b0855..6247c85b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,10 +3,4 @@ .RData .Ruserdata -inst/doc -doc -Meta - docs -/doc/ -/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index ea03dbe2..0211bd6c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: joyn Type: Package Title: Tool for Diagnosis of Tables Joins and 
Complementary Join Features -Version: 0.2.3 +Version: 0.2.3.9000 Authors@R: c(person(given = "R.Andres", family = "Castaneda", email = "acastanedaa@worldbank.org", diff --git a/_pkgdown.yml b/_pkgdown.yml index 04769651..6997f49a 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,4 +1,4 @@ -url: https://randrescastaneda.github.io/joyn/ +url: https://randrescastaneda.github.io/joyn/dev/ template: bootstrap: 5 bottswatch: cosmo diff --git a/docs/404.html b/docs/404.html deleted file mode 100644 index 4c817195..00000000 --- a/docs/404.html +++ /dev/null @@ -1,101 +0,0 @@ - - -
- - - - -YEAR: 2021 -COPYRIGHT HOLDER: joyn authors -- -
LICENSE.md
- Copyright (c) 2021 joyn authors
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-vignettes/adv-functionalities.Rmd
- adv-functionalities.Rmd
-
-library(joyn)
-#>
-#> Attaching package: 'joyn'
-#> The following object is masked from 'package:base':
-#>
-#> merge
-library(data.table)
-#> Warning: package 'data.table' was built under R version 4.3.3
-
-x <- data.table(id = c(1, 4, 2, 3, NA),
- t = c(1L, 2L, 1L, 2L, NA),
- country = c(16, 12, 3, NA, 15))
-
-y <- data.table(id = c(1, 2, 5, 6, 3),
- gdp = c(11L, 15L, 20L, 13L, 10L),
- country = 16:20)
This vignette will let you explore some additional features available
-in joyn
, through an example use case.
Suppose you want to join tables x
and y
,
-where the variable country is available in both. You could do
-one of five things:
If you don’t use the argument by
, joyn
will
-consider country and id as key variables by default
-given that they are common between x
and
-y
.
-
-# The variables with the same name, `id` and `country`, are used as key
-# variables.
-
-joyn(x = x,
- y = y)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 4 44.4%
-#> 2 y 4 44.4%
-#> 3 x & y 1 11.1%
-#> 4 total 9 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id and country from id, gdp, and country
-#> id t country gdp .joyn
-#> <num> <int> <num> <int> <fctr>
-#> 1: 1 1 16 11 x & y
-#> 2: 4 2 12 NA x
-#> 3: 2 1 3 NA x
-#> 4: 3 2 NA NA x
-#> 5: NA NA 15 NA x
-#> 6: 2 NA 17 15 y
-#> 7: 5 NA 18 20 y
-#> 8: 6 NA 19 13 y
-#> 9: 3 NA 20 10 y
Alternatively, you can specify to join by country
-
-
-# Joining by country
-
-joyn(x = x,
- y = y,
- by = "country")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 4 44.4%
-#> 2 y 4 44.4%
-#> 3 x & y 1 11.1%
-#> 4 total 9 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables country from id, gdp, and country
-#> id t country gdp .joyn
-#> <num> <int> <num> <int> <fctr>
-#> 1: 1 1 16 11 x & y
-#> 2: 4 2 12 NA x
-#> 3: 2 1 3 NA x
-#> 4: 3 2 NA NA x
-#> 5: NA NA 15 NA x
-#> 6: NA NA 17 15 y
-#> 7: NA NA 18 20 y
-#> 8: NA NA 19 13 y
-#> 9: NA NA 20 10 y
y
and
-don’t bring it into the resulting table
-This is the default if you did not include country as part of
-the key variables in argument by
.
-
-joyn(x = x,
- y = y,
- by = "id")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, gdp, and country
-#> id t country gdp .joyn
-#> <num> <int> <num> <int> <fctr>
-#> 1: 1 1 16 11 x & y
-#> 2: 4 2 12 NA x
-#> 3: 2 1 3 15 x & y
-#> 4: 3 2 NA 10 x & y
-#> 5: NA NA 15 NA x
-#> 6: 5 NA NA 20 y
-#> 7: 6 NA NA 13 y
Another possibility is to make use of the update_NAs
-argument of joyn()
. This allows you to update the NA
-values in variable country in table x
with the
-actual values of the matching observations in country from
-table y. In this case, actual values in country from table x
-will remain unchanged.
-
-joyn(x = x,
- y = y,
- by = "id",
- update_NAs = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 x & y 2 28.6%
-#> 3 NA updated 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, gdp, and country
-#> id t country gdp .joyn
-#> <num> <int> <num> <int> <fctr>
-#> 1: 1 1 16 11 x & y
-#> 2: 4 2 12 NA x
-#> 3: 2 1 3 15 x & y
-#> 4: 3 2 20 10 NA updated
-#> 5: NA NA 15 NA x
-#> 6: 5 NA 18 20 NA updated
-#> 7: 6 NA 19 13 NA updated
You can also update all the values - both NAs and actual - in
-variable country of table x
with the actual values
-of the matching observations in country from y
.
-This is done by setting update_values = TRUE
.
Notice that the reportvar
allows you to keep track of how
-the update worked. In this case, value update means that only
-the values that are different between country from
-x
and country from y
are updated.
However, let’s consider other possible cases:
-If, for the same matching observations, the values between the -two country variables were the same, the reporting variable -would report x & y instead (so you know that there is no -update to make).
if there are NAs in country from y
, the
-actual values in x
will be unchanged, and you would see a
-not updated status in the reporting variable. Nevertheless,
-notice there is another way for you to bring country from
-y
to x
. This is done through the argument
-keep_common_vars
(see 5. below ⬇️)
-
-# Notice that only the value that are
-
-joyn(x = x,
- y = y,
- by = "id",
- update_values = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 NA updated 3 42.9%
-#> 2 value updated 2 28.6%
-#> 3 not updated 2 28.6%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, gdp, and country
-#> id t country gdp .joyn
-#> <num> <int> <num> <int> <fctr>
-#> 1: 1 1 16 11 value updated
-#> 2: 4 2 12 NA not updated
-#> 3: 2 1 17 15 value updated
-#> 4: 3 2 20 10 NA updated
-#> 5: NA NA 15 NA not updated
-#> 6: 5 NA 18 20 NA updated
-#> 7: 6 NA 19 13 NA updated
Another available option is that of bringing the original variable
-country from y
into the resulting table, without
-using it to update the values in x
. In order to distinguish
-country from x
and country from
-y
, joyn
will assign a suffix to the variable’s
-name: so that you will get country.y and country.x.
-All of this can be done specifying
-keep_common_vars = TRUE.
-
-joyn(x = x,
- y = y,
- by = "id",
- keep_common_vars = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, gdp, and country
-#> id t country.x gdp country.y .joyn
-#> <num> <int> <num> <int> <int> <fctr>
-#> 1: 1 1 16 11 16 x & y
-#> 2: 4 2 12 NA NA x
-#> 3: 2 1 3 15 17 x & y
-#> 4: 3 2 NA 10 20 x & y
-#> 5: NA NA 15 NA NA x
-#> 6: 5 NA NA 20 18 y
-#> 7: 6 NA NA 13 19 y
In joyn
, you can also bring non common variables from
-y
into the resulting table. In fact you can specify them in
-y_vars_to_keep
, as shown in the example below:
-
-# Keeping variable gdp
-
-joyn(x = x,
- y = y,
- by = "id",
- y_vars_to_keep = "gdp")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> id t country gdp .joyn
-#> <num> <int> <num> <int> <fctr>
-#> 1: 1 1 16 11 x & y
-#> 2: 4 2 12 NA x
-#> 3: 2 1 3 15 x & y
-#> 4: 3 2 NA 10 x & y
-#> 5: NA NA 15 NA x
-#> 6: 5 NA NA 20 y
-#> 7: 6 NA NA 13 y
Notice that if you set y_vars_to_keep = FALSE
or
-y_vars_to_keep = NULL
, then joyn
won’t bring
-any variable from y into the returning table.
-
-library(joyn)
-#>
-#> Attaching package: 'joyn'
-#> The following object is masked from 'package:base':
-#>
-#> merge
-library(data.table)
-#> Warning: package 'data.table' was built under R version 4.3.3
This vignette will give you a brief overview of how you can use some
-auxiliary functions that joyn
makes available to the
-user.
One of the advantages of joyn
is that you can perform
-one-to-one (1:1), one-to-many (1:m), many-to-one (m:1), and many-to-many
-(m:m) joins. is_id()
is a function that might come in handy
-when you want to check whether your data table is uniquely identified by
-the variables you want to merge by. In fact this is what
-is_id()
checks by default, returning either TRUE or FALSE
-depending on whether the data table is uniquely identified or not.
-Alternatively, you can set return_report = TRUE
to get a
-summary of the duplicates.
-
-x1 <- data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15,
- c = c("a", "b", "a", "t", "d"),
- c1 = c("h", "j", "k", "l", "y"))
-
-y1 <- data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-
-# Checking if x1 is uniquely identified by "id" with return_report = TRUE
-
-is_id(dt = x1,
- by = "id")
-#>
-#> ── Duplicates in terms of `id`
-#> copies n percent
-#> 1 1 3 75%
-#> 2 2 1 25%
-#> 3 total 4 100%
-#> ─────────────────────────────────────────────────────── End of is_id() report ──
-#> [1] FALSE
-
-# Checking duplicates in x1 with return_report = FALSE
-
-is_id(dt = x1,
- by = "id",
- return_report = FALSE)
-#>
-#> ── Duplicates in terms of `id`
-#> copies n percent
-#> 1 1 3 75%
-#> 2 2 1 25%
-#> 3 total 4 100%
-#> ─────────────────────────────────────────────────────── End of is_id() report ──
-#> [1] FALSE
In joyn
, you can also search for variables which
-possibly uniquely identify your data table x
using the
-possible_ids()
function. For example,
-
-# Identify possible unique identifier excluding variable t
-possible_ids(dt = x1,
- exclude = "t")
-#> ✔ There are no duplicates in data frame
-#> → we found 2 possible ids
-#> $V1
-#> [1] "x"
-#>
-#> $V2
-#> [1] "c1"
-
-# Identify possible unique identifier excluding character variables
-possible_ids(dt = x1,
- exclude = "_character")
-#> ✔ There are no duplicates in data frame
-#> → we found 1 possible id
-#> $V1
-#> [1] "x"
-
-# Identify possible unique identifiers, excluding character variables but considering variable z
-possible_ids(dt = x1,
- exclude = "_character",
- include = "z")
-#> ✔ There are no duplicates in data frame
-#> → we found 1 possible id
-#> $V1
-#> [1] "x"
Additionally, joyn
makes available to the user the
-is_balanced()
function. This is instrumental in assessing
-the completeness of the data table within a specified group, i.e., if
-the table contains all the combinations of observations in the group. By
-default, is_balanced()
will tell you whether or not the table is
-balanced. However, if you set return = "table"
, you will
-get a summary of the unbalanced observations. In other words, those
-combinations of elements between the specified variables that are not
-contained in the input table.
-
-# Example with return = "logic", the default
-
-is_balanced(df = x1,
- by = c("id", "t"))
-#> [1] FALSE
-
-# Example with return = "table"
-is_balanced(df = x1,
- by = c("id", "t"),
- return = "table")
-#> id t
-#> 1 3 1
-#> 2 2 2
Furthermore, joyn
provides a function that generates
-simple frequency tables, so that you can easily have an overview of the
-distribution of values within your data tables.
-
-# Tabulating frequencies of var `id`
-
-freq_table(x = x1,
- byvar = "id")[]
-#> id n percent
-#> 1 1 2 40%
-#> 2 2 1 20%
-#> 3 3 1 20%
-#> 4 <NA> 1 20%
-#> 5 total 5 100%
-
-# Removing NAs from the calculation
-
-freq_table(x = x1,
- byvar = "id",
- na.rm = TRUE)[]
-#> id n percent
-#> 1 1 2 50%
-#> 2 2 1 25%
-#> 3 3 1 25%
-#> 4 total 4 100%
Joining data tables with joyn
is particularly convenient
-as it allows you to analyze/be aware of the quality of the merging.
This vignette explores dplyr-like join functions available in
-joyn
. Their major objective is to let you employ a syntax
-you are supposedly already familiar with - the dplyr
one -
-while at the same time benefiting from the additional tools that
-joyn
offers. That is, obtaining additional information and
-verification of the joining.
There are four types of dplyr-like join functions in
-joyn
:
Left joins: joyn::left_join()
Right joins: joyn::right_join()
Full joins: joyn::full_join()
Inner joins: joyn::inner_join()
Each of them is a wrapper that works in a similar way as the
-corresponding dplyr
function.
-
-library(joyn)
-#>
-#> Attaching package: 'joyn'
-#> The following object is masked from 'package:base':
-#>
-#> merge
-library(data.table)
-#> Warning: package 'data.table' was built under R version 4.3.3
-
-x1 <- data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-
-y1 <- data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
Suppose you want to perform a simple left join
-between tables x1
and y1
.
With joyn
you have two possibilities:
using the joyn()
function, specifying
-keep = "left"
using the joyn::left_join()
-function
In addition, you could use dplyr::left_join()
or base R
-merging functions.
Consider these three options:
-
-
-# Option 1
-
-joyn(x = x1,
- y = y1,
- keep = "left",
- match_type = "m:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 40%
-#> 2 y 1 20%
-#> 3 x & y 2 40%
-#> 4 total 5 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-
-# Option 2
-
-joyn::left_join(x = x1,
- y = y1,
- relationship = "many-to-one")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 40%
-#> 2 y 1 20%
-#> 3 x & y 2 40%
-#> 4 total 5 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> ⚠ Warning: joyn does not currently allow inequality joins, so keep = NULL will
-#> retain only keys in x
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-
-# Option 3
-
-dplyr::left_join(x = x1,
- y = y1,
- relationship = "many-to-one")
-#> Joining with `by = join_by(id)`
-#> id t x y
-#> <num> <int> <int> <num>
-#> 1: 1 1 11 11
-#> 2: 1 2 12 11
-#> 3: 2 1 13 15
-#> 4: 3 2 14 NA
-#> 5: NA NA 15 NA
Comparing the results, you can see that the same data table is returned.
-However, joyn::left_join()
allows you to enjoy both the
-intuitive syntax from dplyr
and the additional tools from
-joyn
. These include additional options to customize how the
-join is performed, the availability of the joyn report, messages
-informing you on time of execution and the status of the join as well as
-the execution of various checks during the merging. (For additional
-information on each of these joyn
’s features, please take a
-look at all the other articles in this website.)
ℹ️ Left joins return in the output table all rows from
-x
, i.e., the left table, and only matching rows from
-y
, i.e., the right table.
-
-# Data tables to be joined
-
-df1 <- data.frame(id = c(1L, 1L, 2L, 3L, NA_integer_, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_, 4L),
- x = 11:16)
-
-df2 <- data.frame(id = c(1,2, 4, NA_integer_, 8),
- y = c(11L, 15L, 16, 17L, 18L),
- t = c(13:17))
Example usage of some of the joyn
’s additional
-options:
Updating NAs in left table
-Using the update_NAs
argument from joyn
you
-can update the values that are NA in the t variable in the left
-table with the actual values from the matching column t in the
-right one
-
-left_join(x = df1,
- y = df2,
- relationship = "many-to-one",
- by = "id",
- update_NAs = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 1 16.7%
-#> 2 x & y 4 66.7%
-#> 3 NA updated 1 16.7%
-#> 4 total 6 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, y, and t
-#> id t.x x y t.y .joyn
-#> 1 1 1 11 11 13 x & y
-#> 2 1 2 12 11 13 x & y
-#> 3 2 1 13 15 14 x & y
-#> 4 3 2 14 NA NA x
-#> 5 NA 16 15 17 16 NA updated
-#> 6 NA 4 16 17 16 x & y
Specifying which variables to keep from the right table -after the join
-
-
-left_join(x = df1,
- y = df2,
- relationship = "many-to-one",
- by = "id",
- y_vars_to_keep = "y")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 1 16.7%
-#> 2 y 2 33.3%
-#> 3 x & y 3 50%
-#> 4 total 6 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> id t x y .joyn
-#> 1 1 1 11 11 x & y
-#> 2 1 2 12 11 x & y
-#> 3 2 1 13 15 x & y
-#> 4 3 2 14 NA x
-#> 5 NA NA 15 17 x & y
-#> 6 NA 4 16 17 x & y
ℹ️ Right joins return in the output table matching rows from
-x
, i.e., the left table, and all rows from y
,
-i.e., the right table.
Example usage of some of the joyn
’s additional
-options:
Specifying a name for the reporting -variable
-
-
-right_join(x = df1,
- y = df2,
- relationship = "many-to-one",
- by = "id",
- reportvar = "right.joyn")
-#>
-#> ── JOYn Report ──
-#>
-#> right.joyn n percent
-#> 1 x 1 14.3%
-#> 2 y 2 28.6%
-#> 3 x & y 4 57.1%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable right.joyn
-#> ℹ Note: Removing key variables id from id, y, and t
-#> id t.x x y t.y right.joyn
-#> 1 1 1 11 11 13 x & y
-#> 2 1 2 12 11 13 x & y
-#> 3 2 1 13 15 14 x & y
-#> 4 4 NA NA 16 15 y
-#> 5 8 NA NA 18 17 y
-#> 6 NA NA 15 17 16 x & y
-#> 7 NA 4 16 17 16 x & y
Updating values in common variables
-By setting update_values = TRUE
, all values in x (both
-NAs and not) will be updated with the actual values of variables in y
-with the same name as the ones in x. You can then see the status of the
-update in the reporting variable.
-
-right_join(x = df1,
- y = df2,
- relationship = "many-to-one",
- by = "id",
- reportvar = "right.joyn")
-#>
-#> ── JOYn Report ──
-#>
-#> right.joyn n percent
-#> 1 x 1 14.3%
-#> 2 y 2 28.6%
-#> 3 x & y 4 57.1%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable right.joyn
-#> ℹ Note: Removing key variables id from id, y, and t
-#> id t.x x y t.y right.joyn
-#> 1 1 1 11 11 13 x & y
-#> 2 1 2 12 11 13 x & y
-#> 3 2 1 13 15 14 x & y
-#> 4 4 NA NA 16 15 y
-#> 5 8 NA NA 18 17 y
-#> 6 NA NA 15 17 16 x & y
-#> 7 NA 4 16 17 16 x & y
ℹ️ Full joins return in the output table all rows, both matching and
-non matching rows from x
, i.e., the left table, and
-y
, i.e., the right table.
-
-full_join(x = x1,
- y = y1,
- relationship = "many-to-one",
- keep = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 33.3%
-#> 2 y 1 16.7%
-#> 3 x & y 3 50%
-#> 4 total 6 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id.y, id, and y
-#> id t x id.y y .joyn
-#> <num> <int> <int> <num> <num> <fctr>
-#> 1: 1 1 11 1 11 x & y
-#> 2: 1 2 12 1 11 x & y
-#> 3: 2 1 13 2 15 x & y
-#> 4: 3 2 14 NA NA x
-#> 5: 4 NA NA 4 16 y
-#> 6: NA NA 15 NA NA x
ℹ️ Inner joins return in the output table only rows that match
-between x
, i.e., the left table, and y
, i.e.,
-the right table.
Simple inner join
-
-
-inner_join(x = df1,
- y = df2,
- relationship = "many-to-one",
- by = "id")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 1 20%
-#> 2 y 2 40%
-#> 3 x & y 2 40%
-#> 4 total 5 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, y, and t
-#> id t.x x y t.y .joyn
-#> 1 1 1 11 11 13 x & y
-#> 2 1 2 12 11 13 x & y
-#> 3 2 1 13 15 14 x & y
-#> 4 NA NA 15 17 16 x & y
-#> 5 NA 4 16 17 16 x & y
vignettes/main-functionalities.Rmd
- main-functionalities.Rmd
-library(joyn)
-#>
-#> Attaching package: 'joyn'
-#> The following object is masked from 'package:base':
-#>
-#> merge
📌 In joyn
, there are two major sets of tools to join
-data tables:
The primary function joyn()
Dplyr-like join functions: left_join()
,
-right_join()
, full_join()
,
-inner_join()
This vignette will explore the main function joyn()
. You
-can read about dplyr-joins in the “dplyr-joins” article
-instead.
-
-library(joyn)
-library(data.table)
-#> Warning: package 'data.table' was built under R version 4.3.3
-
-x1 <- data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-
-y1 <- data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-
-
-x2 <- data.table(id = c(1, 4, 2, 3, NA),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-
-
-y2 <- data.table(id = c(1, 2, 5, 6, 3),
- yd = c(1, 2, 5, 6, 3),
- y = c(11L, 15L, 20L, 13L, 10L),
- x = c(16:20))
-
-x3 <- data.table(id = c("c","b", "d", "d"),
- v = 8:11,
- foo = c(4,2, 7, 3))
-
-y3 <- data.table(id = c("c","b", "c", "a"),
- y = c(11L, 15L, 18L, 20L))
-
-
-x4 <- data.table(id1 = c(1, 1, 2, 3, 3),
- id2 = c(1, 1, 2, 3, 4),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-
-y4 <- data.table(id = c(1, 2, 5, 6, 3),
- id2 = c(1, 1, 2, 3, 4),
- y = c(11L, 15L, 20L, 13L, 10L),
- x = c(16:20))
-
-
-x5 <- data.table(id = c(1, 4, 2, 3, NA),
- t = c(1L, 2L, 1L, 2L, NA),
- country = c(16, 12, 3, NA, 15))
-
-y5 <- data.table(id = c(1, 2, 2, 6, 3),
- gdp = c(11L, 15L, 20L, 13L, 10L),
- country = 16:20)
Let’s suppose that you want to join the two tables x1
-and y1
.
-
-# Calling joyn() to join x1 and y1
-
-joyn(x = x1,
- y = y1,
- match_type = "m:1" ) #Note RT: remove this argument once fixing the default value
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-#> 6: 4 NA NA 16 y
The output table is the result of a full join -which is what
-joyn
always executes by default. This means that the
-returning table will retain both matching and non matching rows from
-both x1
and y1
. Notice that the resulting
-table also contains an additional variable called .joyn
,
-which is the reporting variable. (Read below ⬇️)
A particular feature of joyn
is that it includes the
-reportvar
in the returning table, which
-informs you about the status of the join. You can modify both the name
-and the format of the reporting variable as follows:
Name: by default reportvar = ".joyn"
, but you can
-modify it with reportvar = "myname"
specifying the name you
-want to assign
Format: by default reporttype = "character"
, but
-you can also set it to numeric using
-reporttype = "numeric"
You can see the difference between the two types in the table below1:
-numeric | -character | -meaning | -
---|---|---|
1 | -x | -Obs only available in x table | -
2 | -y | -Obs only available in y table | -
3 | -x & y | -Matching obs available in both tables | -
4 | -NA updated | -NAs in x updated with actual values in variables with -same names in y | -
5 | -value updated | -Actual values and NAs in x updated with actual values -in variables with same names in y | -
6 | -not updated | -Actual values and NAs in x are NOT updated with actual -values in y | -
When performing a join, you might want to specify which variable(s)
-joyn
should join by.
While by default joyn
will consider the variable(s) in
-common between x
and y
as key(s) for the join,
-our suggestion is to make the keys explicit - i.e., specifying it/them
-in the by
argument
-
-# Join with one variable in common
-
-joyn(x = x1,
- y = y1,
- by = "id",
- match_type = "m:1")
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-#> 6: 4 NA NA 16 y
If you don’t want to join by all variables in common between
-x
and y
, you can alternatively use an equivalence (e.g., "id1 = id")
-as an element of the by
vector. This specification allows you
-to join on different variables between x
and
-y.
-
-joyn(x = x4,
- y = y4,
- by = c("id1 = id", "id2"),
- match_type = "m:m")
-#> id1 id2 t x y .joyn
-#> <num> <num> <int> <num> <int> <fctr>
-#> 1: 1 1 1 16 11 x & y
-#> 2: 1 1 2 12 11 x & y
-#> 3: 2 2 1 NA NA x
-#> 4: 3 3 2 NA NA x
-#> 5: 3 4 NA 15 10 x & y
-#> 6: 2 1 NA NA 15 y
-#> 7: 5 2 NA NA 20 y
-#> 8: 6 3 NA NA 13 y
Also, notice that joyn
will sort
the
-resulting table by key variables in by
. This is because
-sort = TRUE
by default.
💡Match type refers to the relationship that exists between the
-observations of the joining tables. The possibility to perform joins
-based on the match type is one of the added values of using
-joyn
.
Following Stata’s convention, we can have four different match -types:
-1:1 (one to one): the
-default2, the variables specified in by
-uniquely identify single observations in both tables –> each
-observation in the left table has a unique match in the right table and
-vice versa
1:m (one to many): only left table is uniquely
-identified by by
variables –> each observation in
-by
var of the left table can have multiple matches in
-by
var of the right table
m:1 (many to one): only right table is uniquely
-identified by by
var -> each observation in left table
-can have only one match in the right table but observations in the right
-table might have multiple matches in the left table
m:m (many to many): variables in by
-do not uniquely identify the observations in either table –> both
-tables can have multiple matches for each observation
We recommend you always specify the match type when joining tables to -ensure the output is correct.
-
-
-# Many to one match type
-joyn(x = x1,
- y = y1,
- by = "id",
- match_type = "m:1")
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-#> 6: 4 NA NA 16 y
-
-# Many to many match type
-joyn(x = x3,
- y = y3,
- by = "id",
- match_type = "m:m")
-#> id v foo y .joyn
-#> <char> <int> <num> <int> <fctr>
-#> 1: c 8 4 11 x & y
-#> 2: c 8 4 18 x & y
-#> 3: b 9 2 15 x & y
-#> 4: d 10 7 NA x
-#> 5: d 11 3 NA x
-#> 6: a NA NA 20 y
-
-# One to one match type - the default
-joyn(x = x2,
- y = y2,
- by = "id",
- match_type = "1:1")
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 4 2 12 NA NA x
-#> 3: 2 1 NA 2 15 x & y
-#> 4: 3 2 NA 3 10 x & y
-#> 5: NA NA 15 NA NA x
-#> 6: 5 NA NA 5 20 y
-#> 7: 6 NA NA 6 13 y
-
-# Same join as:
-
-joyn(x = x2,
- y = y2,
- by = "id")
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 4 2 12 NA NA x
-#> 3: 2 1 NA 2 15 x & y
-#> 4: 3 2 NA 3 10 x & y
-#> 5: NA NA 15 NA NA x
-#> 6: 5 NA NA 5 20 y
-#> 7: 6 NA NA 6 13 y
-
-
-# One to many match type
-joyn(x = x5,
- y = y5,
- by = "id",
- match_type = "1:m")
-#> id t country gdp .joyn
-#> <num> <int> <num> <int> <fctr>
-#> 1: 1 1 16 11 x & y
-#> 2: 4 2 12 NA x
-#> 3: 2 1 3 15 x & y
-#> 4: 2 1 3 20 x & y
-#> 5: 3 2 NA 10 x & y
-#> 6: NA NA 15 NA x
-#> 7: 6 NA NA 13 y
However, if you are unsure/wrong about the relationships between the
-observations in your tables, joyn
will let you know that
-something is not right. Suppose you think your data is uniquely
-identified by variable id
, while it is not. By setting
-match_type = "1:1"
you will get an error, informing you
-that the match type is not as expected.
-
-# Merging correctly but getting error because something is not right in the data
-joyn(x3, y3, by = "id", match_type = "1:1")
-#> ✖ Error: table x is not uniquely identified by id
-#> ✖ Error: table y is not uniquely identified by id
-#> Error in `check_match_type()`:
-#> ! match type inconsistency
-#> ℹ set verbose to TRUE to see where the issue is
-
-# Merging wrongly but getting NO errors because you did not use match_type
-joyn(x3, y3, by = "id")
-#> ✖ Error: table x is not uniquely identified by id
-#> ✖ Error: table y is not uniquely identified by id
-#> Error in `check_match_type()`:
-#> ! match type inconsistency
-#> ℹ set verbose to TRUE to see where the issue is
If instead you don’t care about match types or you don’t think it is
-necessary to use them for your particular needs, you might be fine
-without joyn
.
Join type determines which observations will be kept after the join.
-joyn()
allows you to choose which type of join to execute
-via the keep
argument.
This argument is called keep
rather than
-join_type
to avoid confusion with the argument
-match_type
, and in order to reflect that what you are
-specifying in the end is which observations you want to keep. This
-argument plays the role of allowing joyn()
to mimic the
-behavior of dplyr
’s functions left_join
,
-right_join
, inner_join
, and
-full_join
, the default.
keep
can be of four types:
keep = "full"
: the default, which
-keeps all the observations in x and y, regardless of whether
-they match or not.
-
-# Full join
-
-joyn(x = x1,
- y = y1,
- match_type = "m:m")
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-#> 6: 4 NA NA 16 y
keep = "left"
or
-keep = "master"
: keeps all observations
-in x
, both matching and non, and only those observations in
-y
that match in x
-
-# keep obs in x
-
-joyn(x = x1,
- y = y1,
- keep = "left",
- match_type = "m:m")
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
keep = "right"
or
-keep = "using"
keeps all observations in
-y
, both matching and non, and only those observations in
-x
that match in y
-
-# keep obs in y
-
-joyn(x = x1,
- y = y1,
- keep = "right",
- match_type = "m:m")
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 4 NA NA 16 y
keep = "inner"
keeps only those
-observations that match in both tables.
-
-# keep matching obs in both tables
-
-joyn(x1, y1, keep = "inner", match_type = "m:m")
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
Recall that joyn
is intended to be informative about the
-status and quality of the merging.
📊 JOYn report
-By default, joyn
returns the JOYn report ,
-i.e., a summary table of the merging. This includes the reporting
-variable, the number of rows that come from x
, the number
-of rows that come from y
and those that are common to both
-x
and y
. This info is also shown in percentage
-form in the percent column.
-
-joyn(x = x3,
- y = y3,
- by = "id",
- match_type = "m:m",
- verbose = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 33.3%
-#> 2 y 1 16.7%
-#> 3 x & y 3 50%
-#> 4 total 6 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id v foo y .joyn
-#> <char> <int> <num> <int> <fctr>
-#> 1: c 8 4 11 x & y
-#> 2: c 8 4 18 x & y
-#> 3: b 9 2 15 x & y
-#> 4: d 10 7 NA x
-#> 5: d 11 3 NA x
-#> 6: a NA NA 20 y
📝 Displaying messages
-One of the value-added features of joyn
is that it produces a
-number of messages that are intended to inform you about the status of
-the join. The display of such messages is controlled by the argument
-verbose
, which allows you to show
-(verbose = TRUE
) or silence (verbose = FALSE
)
-any messages.
To further explore messages in joyn
, please refer to the
-“Messages” article.
-
-library(joyn)
-#>
-#> Attaching package: 'joyn'
-#> The following object is masked from 'package:base':
-#>
-#> merge
-library(data.table)
-#> Warning: package 'data.table' was built under R version 4.3.3
-
- x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
- y1 = data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-
- x2 = data.table(id1 = c(1, 1, 2, 3, 3),
- id2 = c(1, 1, 2, 3, 4),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-
- y2 = data.table(id = c(1, 2, 5, 6, 3),
- id2 = c(1, 1, 2, 3, 4),
- y = c(11L, 15L, 20L, 13L, 10L),
- x = c(16:20))
-
This vignette describes the use of the joyn
-merge()
function.
🔀 joyn::merge
resembles the usability of
-base::merge
and data.table::merge
, while also
-incorporating the additional features that characterize
-joyn
. In fact, joyn::merge
masks the other
-two.
Suppose you want to merge x1
and y1
. First
-notice that while base::merge
is principally for data
-frames, joyn::merge
coerces x
and
-y
to data tables if they are not already.
By default, merge
will join by the shared column name(s)
-in x
and y
.
-
-# Example not specifying the key
-merge(x = x1,
- y = y1)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 66.7%
-#> 2 y 1 33.3%
-#> 3 total 3 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> ⚠ Warning: The keys supplied uniquely identify y, therefore a m:1 join is
-#> executed
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-
-# Example specifying the key
-merge(x = x1,
- y = y1,
- by = "id")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 66.7%
-#> 2 y 1 33.3%
-#> 3 total 3 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> ⚠ Warning: The keys supplied uniquely identify y, therefore a m:1 join is
-#> executed
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
As usual, if the columns you want to join by don’t have the same
-name, you need to tell merge which columns you want to join
-by: by.x
for the x data frame column name,
-and by.y
for the y one. For example,
-
-df1 <- data.frame(id = c(1L, 1L, 2L, 3L, NA_integer_, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_, 4L),
- x = 11:16)
-
-df2 <- data.frame(id = c(1,2, 4, NA_integer_, 8),
- y = c(11L, 15L, 16, 17L, 18L),
- t = c(13:17))
-
-merge(x = df1,
- y = df2,
- by.x = "x",
- by.y = "y")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 3 100%
-#> 2 y 2 66.7%
-#> 3 total 3 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables keyby1 from id, keyby1, and t
-#> ⚠ Warning: The keys supplied uniquely identify both x and y, therefore a 1:1
-#> join is executed
-#> id.x t.x x id.y t.y .joyn
-#> 1 1 1 11 1 13 x & y
-#> 2 NA NA 15 2 14 x & y
-#> 3 NA 4 16 4 15 x & y
By default, sort
is TRUE
, so that the
-merged table will be sorted by the by.x
column. Notice that
-the output table distinguishes non-by column t coming from
-x
from the one coming from y
by adding the
-.x and .y suffixes -which occurs because the
-no.dups
argument is set to TRUE
by
-default.
In a similar fashion as the joyn()
primary function
-does, merge()
offers a number of arguments to
-verify/control the merge1.
For example, joyn::joyn
allows you to execute one-to-one,
-one-to-many, many-to-one and many-to-many joins. Similarly,
-merge
accepts the match_type
argument:
-
-# Example with many to many merge
-joyn::merge(x = x2,
- y = y2,
- by.x = "id1",
- by.y = "id2",
- match_type = "m:m")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 y 1 14.3%
-#> 2 x & y 6 85.7%
-#> 3 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables keyby1 from id, keyby1, y, and x
-#> ⚠ Warning: Supplied both by and by.x/by.y. by argument will be ignored.
-#> id1 id2 t x.x id y x.y .joyn
-#> <num> <num> <int> <num> <num> <int> <int> <fctr>
-#> 1: 1 1 1 16 1 11 16 x & y
-#> 2: 1 1 1 16 2 15 17 x & y
-#> 3: 1 1 2 12 1 11 16 x & y
-#> 4: 1 1 2 12 2 15 17 x & y
-#> 5: 2 2 1 NA 5 20 18 x & y
-#> 6: 3 3 2 NA 6 13 19 x & y
-#> 7: 3 4 NA 15 6 13 19 x & y
-
-# Example with many to one merge
-joyn::merge(x = x1,
- y = y1,
- by = "id",
- match_type = "m:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 66.7%
-#> 2 y 1 33.3%
-#> 3 total 3 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> ⚠ Warning: Supplied both by and by.x/by.y. by argument will be ignored.
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
In a similar way, you can exploit all the other additional options
-available in joyn()
, e.g., for keeping common variables,
-updating NAs and values, displaying messages etc…, which you can explore
-in the “Advanced functionalities” article.
✅ This vignette is dedicated to one specific feature of
-joyn
: displaying information through
-messages.
We’ll start with a rough overview of the different kinds of messages -that might be generated when merging two data tables, then discuss each -of them in detail with representative examples.
-Joyn
messages can be of 4 different types:
Info
Timing
Warning
Error
-
-# Setup
-library(joyn)
-#>
-#> Attaching package: 'joyn'
-#> The following object is masked from 'package:base':
-#>
-#> merge
-library(data.table)
-#> Warning: package 'data.table' was built under R version 4.3.3
-
-# Checking available types of messages
-msgs_types = joyn:::type_choices()
-print(msgs_types)
-#> [1] "info" "note" "warn" "timing" "err"
Info messages are intended to inform you about various aspects of the -join and the data tables involved, as you can see in the examples -below.
-Recall that one of the additional features of joyn
is
-that it returns a reporting variable with the status of the join.
-Examples in this regard include info messages that tell you in which
-variable you can find the joyn
report, or if the
-reporting variable is not returned instead.
Recall that one of the additional features of joyn is that it returns
-a reporting variable with the status of the join.
-Examples in this regard include info messages that tell you in which
-variable you can find the joyn
report, or if the
-reporting variable is not returned instead. Also, an info message might
-let you know that the name you want to assign to the reporting variable
-is already present in the returning table, so that it will be changed to
-another one.
-
-# Example dataframes
-
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-
-y1 = data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-
-
-x2 = data.table(id = c(1, 4, 2, 3, NA),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-
-
-y2 = data.table(id = c(1, 2, 5, 6, 3),
- yd = c(1, 2, 5, 6, 3),
- y = c(11L, 15L, 20L, 13L, 10L),
- x = c(16:20))
-
-x3 = data.table(id1 = c(1, 1, 2, 3, 3),
- id2 = c(1, 1, 2, 3, 4),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-
-
-y3 = data.table(id3 = c(1, 2, 5, 6, 3),
- id4 = c(1, 1, 2, 3, 4),
- y = c(11L, 15L, 20L, 13L, 10L),
- z = c(16:20))
-
-
-
-# ------------------- Showing which var contains joyn report -------------------
-
-# Joining x2 and y2
-joyn(x = x2,
- y = y2,
- by = "id",
- y_vars_to_keep = FALSE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> id t x .joyn
-#> <num> <int> <num> <fctr>
-#> 1: 1 1 16 x & y
-#> 2: 4 2 12 x
-#> 3: 2 1 NA x & y
-#> 4: 3 2 NA x & y
-#> 5: NA NA 15 x
-#> 6: 5 NA NA y
-#> 7: 6 NA NA y
-
-# Printing the info message
-joyn_msg(msg_type = "info")
-#> ℹ Note: Joyn's report available in variable .joyn
-
-# ---------------- Info about change in reporting variable name ----------------
-joyn(x = x2,
- y = y2,
- by = "id",
- reportvar = "x",
- y_vars_to_keep = FALSE)
-#>
-#> ── JOYn Report ──
-#>
-#> x.1 n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable x
-#> ℹ Note: reportvar x is already part of the resulting table. It will be changed
-#> to x.1
-#> id t x x.1
-#> <num> <int> <num> <fctr>
-#> 1: 1 1 16 x & y
-#> 2: 4 2 12 x
-#> 3: 2 1 NA x & y
-#> 4: 3 2 NA x & y
-#> 5: NA NA 15 x
-#> 6: 5 NA NA y
-#> 7: 6 NA NA y
-
-joyn_msg(msg_type = "info")
-#> ℹ Note: Joyn's report available in variable x
-#> ℹ Note: reportvar x is already part of the resulting table. It will be changed
-#> to x.1
-
-# ------------- Informing that reporting variable is not returned -------------
-joyn(x = x2,
- y = y2,
- by = "id",
- reportvar = FALSE,
- y_vars_to_keep = FALSE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Reporting variable is NOT returned
-#> id t x
-#> <num> <int> <num>
-#> 1: 1 1 16
-#> 2: 4 2 12
-#> 3: 2 1 NA
-#> 4: 3 2 NA
-#> 5: NA NA 15
-#> 6: 5 NA NA
-#> 7: 6 NA NA
-
-joyn_msg(msg_type = "info")
-#> ℹ Note: Reporting variable is NOT returned
Furthermore, info messages will help you keep track of which
-variables in y
will be kept after
-the merging, for example notifying you if any of the y
-variables you have specified to keep will be removed because they are
-part of the by
variables.
-
-joyn(x = x2,
- y = y2,
- by = "id",
- y_vars_to_keep = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, yd, y, and x
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 4 2 12 NA NA x
-#> 3: 2 1 NA 2 15 x & y
-#> 4: 3 2 NA 3 10 x & y
-#> 5: NA NA 15 NA NA x
-#> 6: 5 NA NA 5 20 y
-#> 7: 6 NA NA 6 13 y
-
-joyn_msg(msg_type = "info")
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, yd, y, and x
Timing messages report in how many seconds the join is executed, -including the time spent to perform all checks.
-While performing the join, joyn
keeps track of the
-time spent for the execution. This is then displayed in
-timing messages, which report the elapsed time measured in seconds.
Before visualizing some examples, it is important to recall a feature
-of how joyn
executes any join between two data tables.
Specifically, joyn
always first executes a full join
-between the data tables - which includes all matching and non matching
-rows in the resulting table. Then, it filters the rows depending on the
-specific type of join that user wants to execute. For example, if the
-user sets keep = "right"
, joyn
will filter the
-table resulting from the full join and return to the user the data table
-retaining all rows from the right table and only
-matching rows from the left table. In addition, note that since
-joyn
performs a number of checks throughout the execution
-(e.g., checking that the specified key for the merge is valid, or the
-match type consistency), the time spent on checks will also be included
-in reported time.
As a result, timing messages enable you to be aware of both:
-
-
-# --------------------------- Example with full join ---------------------------
-
-joyn(x = x1,
- y = y1,
- match_type = "m:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 33.3%
-#> 2 y 1 16.7%
-#> 3 x & y 3 50%
-#> 4 total 6 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-#> 6: 4 NA NA 16 y
-
-joyn_msg("timing")
-#> ● Timing:The full joyn is executed in 0.000251 seconds.
-#> ● Timing: The entire joyn function, including checks, is executed in 0.020076
-#> seconds.
-
-
-# --------------------------- Example with left join ---------------------------
-left_join(x = x1,
- y = y1,
- relationship = "many-to-one")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 40%
-#> 2 y 1 20%
-#> 3 x & y 2 40%
-#> 4 total 5 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-
-joyn_msg("timing")
-#> ● Timing:The full joyn is executed in 0.000499 seconds.
-#> ● Timing: The entire joyn function, including checks, is executed in 0.01727
-#> seconds.
joyn
generates warning messages to alert you about
-potentially problematic situations which, however, do not warrant terminating
-execution of the merge.
For example, if you provide a match type that is inconsistent with
-the data, joyn
will generate a warning to inform you about
-the actual relationship and to alert that the join will be executed
-accordingly.
In the example below, both x2
and y2
are
-uniquely identified by the key id
, but the user is choosing
-a “one to many” relationship instead. The user will be alerted and a
-“one to one” join will be executed instead.
-
-# Warning that "id" uniquely identifies y2
-
-joyn(x2, y2, by = "id", match_type = "1:m", verbose = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, yd, y, and x
-#> ⚠ Warning: The keys supplied uniquely identify y, therefore a 1:1 join is
-#> executed
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 4 2 12 NA NA x
-#> 3: 2 1 NA 2 15 x & y
-#> 4: 3 2 NA 3 10 x & y
-#> 5: NA NA 15 NA NA x
-#> 6: 5 NA NA 5 20 y
-#> 7: 6 NA NA 6 13 y
-joyn_msg("warn")
-#> ⚠ Warning: The keys supplied uniquely identify y, therefore a 1:1 join is
-#> executed
In a similar way, warning messages are generated when choosing
-match_type = "m:m" or "m:1"
-
-# ------------ Warning that "id" uniquely identifies both x2 and y2 ------------
-
-joyn(x2, y2, by = "id", match_type = "m:m", verbose = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, yd, y, and x
-#> ⚠ Warning: The keys supplied uniquely identify both x and y, therefore a 1:1
-#> join is executed
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 4 2 12 NA NA x
-#> 3: 2 1 NA 2 15 x & y
-#> 4: 3 2 NA 3 10 x & y
-#> 5: NA NA 15 NA NA x
-#> 6: 5 NA NA 5 20 y
-#> 7: 6 NA NA 6 13 y
-joyn_msg("warn")
-#> ⚠ Warning: The keys supplied uniquely identify both x and y, therefore a 1:1
-#> join is executed
-
-# ------------------ Warning that "id" uniquely identifies x2 ------------------
-
-joyn(x2, y2, by = "id", match_type = "m:1", verbose = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, yd, y, and x
-#> ⚠ Warning: The keys supplied uniquely identify x, therefore a 1:1 join is
-#> executed
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 4 2 12 NA NA x
-#> 3: 2 1 NA 2 15 x & y
-#> 4: 3 2 NA 3 10 x & y
-#> 5: NA NA 15 NA NA x
-#> 6: 5 NA NA 5 20 y
-#> 7: 6 NA NA 6 13 y
-joyn_msg("warn")
-#> ⚠ Warning: The keys supplied uniquely identify x, therefore a 1:1 join is
-#> executed
Other examples of warnings are those that arise when you are trying
-to supply certain arguments to the merging functions that are not yet
-supported by the current version of joyn
.
Suppose you are executing a left-join and you try to set the
-na_matches
argument to ‘never’. joyn
will warn
-you that it currently allows only na_matches = 'na'
. A
-similar message is displayed when keep = NULL
. Given that
-the current version of joyn
does not support inequality
-joins, joyn
will warn you that keep = NULL
-will make the join retain only keys in x
.
-
-joyn::left_join(x = x1,
- y = y1,
- relationship = "many-to-one",
- keep = NULL,
- na_matches = "never")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 40%
-#> 2 y 1 20%
-#> 3 x & y 2 40%
-#> 4 total 5 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> ⚠ Warning: joyn does not currently allow inequality joins, so keep = NULL will
-#> retain only keys in x
-#> ⚠ Warning: Currently, joyn allows only na_matches = 'na'
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-
-joyn_msg("warn")
-#> ⚠ Warning: joyn does not currently allow inequality joins, so keep = NULL will
-#> retain only keys in x
-#> ⚠ Warning: Currently, joyn allows only na_matches = 'na'
Error messages act as helpful notifications about the reasons why the -join you are trying to perform can’t be executed. Error messages -highlight where you went off course and provide clues to fix the issue -so that the merging can be successfully executed.
-Sometimes error messages are due to a wrong/missing provision of the
-inputs, for example if you do not supply variables to be used as key for
-the merge, and x
and y
do not have any common
-variable names. Error messages will also pop up if you provide an input
-data table that has no variables, or that has duplicate variable
-names.
Representative messages in this regard can be visualized below:
-
-
-# ----------------- Error due to input table x with no columns -----------------
-
-x_empty = data.table()
-
-joyn(x = x_empty,
- y = y1)
-#> ✖ Error: Input table x has no columns.
-#> Error in `check_xy()`:
-#> ! wrong input specification
-
-joyn_msg("err")
-#> ✖ Error: Input table x has no columns.
-
-# ----------------------- Error due to duplicate names ------------------------
-
-x_duplicates = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- x = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15,
- check.names = FALSE)
-joyn(x = x_duplicates,
- y = y1)
-#> ✖ Error: Table x has the following column duplicated: x. Please rename or
-#> remove and try again.
-#> Error in `check_xy()`:
-#> ! wrong input specification
-
-joyn_msg("err")
-#> ✖ Error: Table x has the following column duplicated: x. Please rename or
-#> remove and try again.
Furthermore, error messages are generated when choosing the wrong
-match_type
, that is not consistent with the actual
-relationship between the variables being used for merging.
-joyn
will therefore display the following message:
-
-joyn(x = x1, y=y1, by="id", match_type = "1:1")
-#> ✖ Error: table x is not uniquely identified by id
-#> Duplicate counts in x:
-#> id copies
-#> <int> <int>
-#> 1: 1 2
-#> Error in `check_match_type()`:
-#> ! match type inconsistency
-#> ℹ refer to the duplicate counts in the table(s) above to identify where the
-#> issue occurred
-joyn_msg("err")
-#> ✖ Error: table x is not uniquely identified by id
joyn
messages?
-joyn
stores the messages in the joyn
-environment.
In order to print them, you can use the joyn_msg()
-function. The msg_type
argument allows you to specify a
-certain type of message you would like to visualize, or, if you want all
-of them to be displayed, you can just set msg_type = 'all'
-
-# Execute a join
-
-joyn(x = x1,
- y = y1,
- match_type = "m:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 33.3%
-#> 2 y 1 16.7%
-#> 3 x & y 3 50%
-#> 4 total 6 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-#> 6: 4 NA NA 16 y
-
-# Print all messages stored
-joyn_msg(msg_type = "all")
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> ● Timing:The full joyn is executed in 0.000206 seconds.
-#> ● Timing: The entire joyn function, including checks, is executed in 0.016425
-#> seconds.
-
-# Print info messages only
-joyn_msg(msg_type = "info")
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
joyn
empowers you to assess the results of joining data frames, making it easier and more efficient to combine your tables. Similar in philosophy to the merge
command in Stata
, joyn
offers matching key variables and detailed join reports to ensure accurate and insightful results.
Merging tables in R can be tricky. Ensuring accuracy and understanding the joined data fully can be tedious tasks. That’s where joyn
comes in. Inspired by Stata’s informative approach to merging, joyn
makes the process smoother and more insightful.
While standard R merge functions are powerful, they often lack features like assessing join accuracy, detecting potential issues, and providing detailed reports. joyn
fills this gap by offering:
joyn
helps you navigate them confidently.joyn
special?
-While standard R merge functions offer basic functionality, joyn
goes above and beyond by providing comprehensive tools and features tailored to your data joining needs:
1. Flexibility in join types: Choose your ideal join type (“left”, “right”, or “inner”) with the keep
argument. Unlike R’s default, joyn
performs a full join by default, ensuring all observations are included, but you have full control to tailor the results.
2. Seamless variable handling: No more wrestling with duplicate variable names! joyn
offers multiple options:
Update values: Use update_values
or update_NA
to automatically update conflicting variables in the left table with values from the right table.
Keep both (with different names): Enable keep_common_vars = TRUE
to retain both variables, each with a unique suffix.
Selective inclusion: Choose specific variables from the right table with y_vars_to_keep
, ensuring you get only the data you need.
3. Relationship awareness: joyn
recognizes one-to-one, one-to-many, many-to-one, and many-to-many relationships between tables. While it defaults to many-to-many for compatibility, remember this is often not ideal. Always specify the correct relationship using by
arguments for accurate and meaningful results.
4. Join success at a glance: Get instant feedback on your join with the automatically generated reporting variable. Identify potential issues like unmatched observations or missing values to ensure data integrity and informed decision-making.
-By addressing these common pain points and offering enhanced flexibility, joyn
empowers you to confidently and effectively join your data frames, paving the way for deeper insights and data-driven success.
While raw speed is essential, understanding your joins every step of the way is equally crucial. joyn
prioritizes providing insightful information and preventing errors over solely focusing on speed. Unlike other functions, it adds:
joyn
performs comprehensive checks to ensure your join is accurate and avoids potential missteps, like unmatched observations or missing values.These valuable features contribute to a slightly slower performance compared to functions like data.table::merge.data.table()
or collapse::join()
. However, the benefits of preventing errors and gaining invaluable insights far outweigh the minor speed difference.
data.table
or collapse
directly.joyn
is your trusted guide.joyn
intentionally restricts certain actions and provides clear messages when encountering unexpected data configurations. This might seem opinionated, but it’s designed to protect you from accidentally creating inaccurate or misleading joins. This “safety net” empowers you to confidently merge your data, knowing joyn
has your back.
joyn
as wrapper: Familiar Syntax, Familiar Power
-While joyn::joyn()
offers the core functionality and Stata-inspired arguments, you might prefer a syntax more aligned with your existing workflow. joyn
has you covered!
Embrace base R and data.table
:
joyn::merge()
: Leverage familiar base R and data.table
syntax for seamless integration with your existing code.Join with flair using dplyr
:
joyn::{dplyr verbs}()
: Enjoy the intuitive verb-based syntax of dplyr
for a powerful and expressive way to perform joins.Dive deeper: Explore the corresponding vignettes to unlock the full potential of these alternative interfaces and find the perfect fit for your data manipulation style.
-You can install the stable version of joyn
from CRAN with:
-install.packages("joyn")
The development version from GitHub with:
-
-# install.packages("devtools")
-devtools::install_github("randrescastaneda/joyn")
-
-library(joyn)
-#>
-#> Attaching package: 'joyn'
-#> The following object is masked from 'package:base':
-#>
-#> merge
-library(data.table)
-
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-
-y1 = data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-
-
-x2 = data.table(id = c(1, 4, 2, 3, NA),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-
-
-y2 = data.table(id = c(1, 2, 5, 6, 3),
- yd = c(1, 2, 5, 6, 3),
- y = c(11L, 15L, 20L, 13L, 10L),
- x = c(16:20))
-
-# using common variable `id` as key.
-joyn(x = x1,
- y = y1,
- match_type = "m:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 33.3%
-#> 2 y 1 16.7%
-#> 3 x & y 3 50%
-#> 4 total 6 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-#> 6: 4 NA NA 16 y
-
-# keep just those observations that match
-joyn(x = x1,
- y = y1,
- match_type = "m:1",
- keep = "inner")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 66.7%
-#> 2 y 1 33.3%
-#> 3 total 3 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-
-# Bad merge for not specifying by argument
-joyn(x = x2,
- y = y2,
- match_type = "1:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 4 44.4%
-#> 2 y 4 44.4%
-#> 3 x & y 1 11.1%
-#> 4 total 9 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id and x from id, yd, y, and x
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 4 2 12 NA NA x
-#> 3: 2 1 NA NA NA x
-#> 4: 3 2 NA NA NA x
-#> 5: NA NA 15 NA NA x
-#> 6: 2 NA 17 2 15 y
-#> 7: 5 NA 18 5 20 y
-#> 8: 6 NA 19 6 13 y
-#> 9: 3 NA 20 3 10 y
-
-# good merge, ignoring variable x from y
-joyn(x = x2,
- y = y2,
- by = "id",
- match_type = "1:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, yd, y, and x
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 4 2 12 NA NA x
-#> 3: 2 1 NA 2 15 x & y
-#> 4: 3 2 NA 3 10 x & y
-#> 5: NA NA 15 NA NA x
-#> 6: 5 NA NA 5 20 y
-#> 7: 6 NA NA 6 13 y
-
-# update NAs in var x in table x from var x in y
-joyn(x = x2,
- y = y2,
- by = "id",
- update_NAs = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 x & y 1 14.3%
-#> 3 NA updated 4 57.1%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, yd, y, and x
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 4 2 12 NA NA x
-#> 3: 2 1 17 2 15 NA updated
-#> 4: 3 2 20 3 10 NA updated
-#> 5: NA NA 15 NA NA x
-#> 6: 5 NA 18 5 20 NA updated
-#> 7: 6 NA 19 6 13 NA updated
-
-# update values in var x in table x from var x in y
-joyn(x = x2,
- y = y2,
- by = "id",
- update_values = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 NA updated 4 57.1%
-#> 2 value updated 1 14.3%
-#> 3 not updated 2 28.6%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, yd, y, and x
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 value updated
-#> 2: 4 2 12 NA NA not updated
-#> 3: 2 1 17 2 15 NA updated
-#> 4: 3 2 20 3 10 NA updated
-#> 5: NA NA 15 NA NA not updated
-#> 6: 5 NA 18 5 20 NA updated
-#> 7: 6 NA 19 6 13 NA updated
-
-
-# do not bring any variable from y into x, just the report
-joyn(x = x2,
- y = y2,
- by = "id",
- y_vars_to_keep = NULL)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 28.6%
-#> 2 y 2 28.6%
-#> 3 x & y 3 42.9%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> id t x .joyn
-#> <num> <int> <num> <fctr>
-#> 1: 1 1 16 x & y
-#> 2: 4 2 12 x
-#> 3: 2 1 NA x & y
-#> 4: 3 2 NA x & y
-#> 5: NA NA 15 x
-#> 6: 5 NA NA y
-#> 7: 6 NA NA y
NEWS.md
- CRAN release: 2024-08-21
-CRAN release: 2024-07-10
-Add anti_join()
function.
Add unmask_joyn()
function to unmask joyn
functions that mask dplyr
equivalents.
Add information about duplicated obs in by
variable when match type is 1
rather than m
.
improve inefficiencies in deep copies with m:m
joins
Replace m:m
joins from data.table::merge.data.table
to collapse::join
. Thanks to @SebKrantz for the suggestion (#58).
Add information about duplicated obs in by
variable when match type is 1
rather than m
.
Internal: improve storing of joyn messages.
Improve creation of reporting variable. Now, it is created in [collapse::join] rather than in joyn
function. In addition, the reporting variable is created as factor to improve performance. Thanks to @SebKrantz for the suggestion (#58)
Now, by default, joyn
will not sort the data. This is to avoid unnecessary computational time that most of the time is not needed. If the user wants to sort the data, they can use the sort
argument, which triggers the sorting mechanism of collapse
package.
report variable (named “.joyn” by default) is now a factor instead of character. Yet, users can still use character if they want with the reporttype = "character"
.
CRAN release: 2024-03-29
-joyn
has gained two new authors: Zander Prinsloo and Rossana Tatulli. Function joyn::merge()
was replaced by joyn::joyn()
. This is now the main function of the joyn
package.
Arguments allow.cartesian
, yvars
, and keep_y_in_x
have been deprecated. The latter two have been replaced by y_vars_to_keep
and keep_common_vars
, respectively. The new argument names bring more clarity about what the arguments do.
New function joyn::merge()
works as a mask for the base::merge()
or data.table::merge.data.table()
. joyn::merge()
has the same features as the previous two, but includes the features of joyn::joyn()
.
Message styles have been improved and categorized. See the message vignette for more information.
New functions to mimic dplyr joins. The joyn
variants have all the features for joyn::joyn()
but lack some of the most advanced features of dplyr
joins like joyn::join_by()
CRAN release: 2021-12-14
-update_NAs now could be FALSE even if update_values is TRUE
Select rows-to-keep before transformation of updated values and NAs to avoid keeping rows from y that did not match in x but whose values got updated because update_values = TRUE
Change to data.table::merge.data.table syntax in all joins. It makes it easier to work with and consistent across different join types.
Remove previous lazy-loaded data.
possible_ids()
to identify what variables are suitable for uniquely identifying the database. Add function is_id()
to check whether the table is uniquely identified by key variables
Add function freq_table()
as a substitute for janitor::tabyl. This makes it more convenient for users who do not have janitor installed.
R/dplyr-joins.R
- arguments_checks.Rd
Perform necessary preliminary checks on arguments that are passed to joyn
-arguments_checks(
- x,
- y,
- by,
- copy,
- keep,
- suffix,
- na_matches,
- multiple,
- relationship,
- reportvar
-)
data frame: left table
data frame: right table
character vector of variables to join by
If x
and y
are not from the same data source,
-and copy
is TRUE
, then y
will be copied into the
-same src as x
. This allows you to join tables across srcs, but
-it is a potentially expensive operation so you must opt into it.
Should the join keys from both x
and y
be preserved in the
-output?
If NULL
, the default, joins on equality retain only the keys from x
,
-while joins on inequality retain the keys from both inputs.
If TRUE
, all keys from both inputs are retained.
If FALSE
, only keys from x
are retained. For right and full joins,
-the data in key columns corresponding to rows that only exist in y
are
-merged into the key columns from x
. Can't be used when joining on
-inequality conditions.
If there are non-joined duplicate variables in x
and
-y
, these suffixes will be added to the output to disambiguate them.
-Should be a character vector of length 2.
Should two NA
or two NaN
values match?
Handling of rows in x
with multiple matches in y
.
-For each row of x
:
"all"
, the default, returns every match detected in y
. This is the
-same behavior as SQL.
"any"
returns one match detected in y
, with no guarantees on which
-match will be returned. It is often faster than "first"
and "last"
-if you just need to detect if there is at least one match.
"first"
returns the first match detected in y
.
"last"
returns the last match detected in y
.
Handling of the expected relationship between the keys of
-x
and y
. If the expectations chosen from the list below are
-invalidated, an error is thrown.
NULL
, the default, doesn't expect there to be any relationship between
-x
and y
. However, for equality joins it will check for a many-to-many
-relationship (which is typically unexpected) and will warn if one occurs,
-encouraging you to either take a closer look at your inputs or make this
-relationship explicit by specifying "many-to-many"
.
See the Many-to-many relationships section for more details.
"one-to-one"
expects:
Each row in x
matches at most 1 row in y
.
Each row in y
matches at most 1 row in x
.
"one-to-many"
expects:
Each row in y
matches at most 1 row in x
.
"many-to-one"
expects:
Each row in x
matches at most 1 row in y
.
"many-to-many"
doesn't perform any relationship checks, but is provided
-to allow you to be explicit about this relationship if you know it
-exists.
relationship
doesn't handle cases where there are zero matches. For that,
-see unmatched
.
character: Name of reporting variable. Default is ".joyn". -This is the same as variable "_merge" in Stata after performing a merge. If -FALSE or NULL, the reporting variable will be excluded from the final -table, though a summary of the join will be displayed after concluding.
This function checks the variable name(s) to be used as key(s) of the join
-A vector of shared column names in x
and y
to merge on.
-This defaults to the shared key columns between the two tables.
-If y
has no key columns, this defaults to the key of x
.
data table
s. y
is coerced to a data.table
if
-it isn't one already.
if (FALSE) {
-x1 = data.frame(
- id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-y1 = data.frame(id = 1:2,
- y = c(11L, 15L))
-# With var "id" shared in x and y
-joyn:::check_by_vars(by = "id", x = x1, y = y1)
-}
-
check variable(s) by which data frames are joined: either a single by
var, common to right and left dt,
-or
left table
right table
character: variable to join by (common variable to x and y)
character: specified var in x to join by
character: specified var in y to join by
if (FALSE) {
-x = data.table(id1 = c(1, 1, 2, 3, 3),
- id2 = c(1, 1, 2, 3, 4),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-y = data.table(id = c(1, 2, 5, 6, 3),
- id2 = c(1, 1, 2, 3, 4),
- y = c(11L, 15L, 20L, 13L, 10L),
- x = c(16:20))
-# example specifying by.x and by.y
-joyn:::check_dt_by(x, y, by.x = "id1", by.y = "id2")
-}
-
Check if vars in dt have duplicate names
-if (FALSE) {
-# When no duplicates
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-joyn:::check_duplicate_names(x1, "x")
-
-# When duplicates
-x1_duplicates = data.frame(id = c(1L, 1L, 2L, 3L, NA_integer_),
- x = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15,
- check.names = FALSE)
-joyn:::check_duplicate_names(x1_duplicates, "x")
-}
-
This function checks if the match type chosen by the user is consistent with the data.
-
(Match type must be one of the valid types: "1:1", "1:m", "m:1", "m:m")
check_match_type(x, y, by, match_type, verbose = getOption("joyn.verbose"))
data table
s. y
is coerced to a data.table
if
-it isn't one already.
A vector of shared column names in x
and y
to merge on.
-This defaults to the shared key columns between the two tables.
-If y
has no key columns, this defaults to the key of x
.
character: one of "m:m", "m:1", "1:m", "1:1". -Default is "1:1" since this is the most restrictive. However, following -Stata's recommendation, it is better to be explicit and use any of the -other three match types (See details in match types sections).
character vector from split_match_type
- - -if (FALSE) {
-# Consistent match type
-x1 = data.frame(
- id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-y1 = data.frame(id = 1:2,
- y = c(11L, 15L))
-joyn:::check_match_type(x = x1, y=y1, by="id", match_type = "m:1")
-
-# Inconsistent match type
-joyn:::check_match_type(x = x1, y=y1, by="id", match_type = "1:1")
-}
-
R/checks.R
- check_new_y_vars.Rd
Check vars in y with same names as vars in x, and return new variables names for those y vars for the joined data frame
-master table
character: by vars
character vector of y variables to keep
if (FALSE) {
-y2 = data.frame(id = c(1, 2, 5, 6, 3),
- yd = c(1, 2, 5, 6, 3),
- y = c(11L, 15L, 20L, 13L, 10L),
- x = c(16:20))
-y_vars_to_keep <- joyn:::check_y_vars_to_keep(TRUE, y2, by = "id")
-x2 = data.frame(id = c(1, 1, 2, 3, NA),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-joyn:::check_new_y_vars(x = x2, by="id", y_vars_to_keep)
-}
-
check reportvar input
If resulting data frame has a reporting variable (storing joyn's report), check and return a valid name.
check_reportvar(reportvar, verbose = getOption("joyn.verbose"))
if input reportvar is character, return valid name for the report var. If NULL or FALSE, return NULL.
-if (FALSE) {
-# When null - reporting variable not returned in merged dt
-joyn:::check_reportvar(reportvar = NULL)
-# When FALSE - reporting variable not returned in merged dt
-joyn:::check_reportvar(reportvar = FALSE)
-# When character
-joyn:::check_reportvar(reportvar = ".joyn")
-}
-
R/dplyr-joins.R
- check_unmatched_keys.Rd
Conduct all unmatched keys checks and return error if necessary
-This function performs checks inspired by merge.data.table: it detects errors
if x and/or y have no columns
if x and/or y contain duplicate column names
data frame: referred to as left in R terminology, or master in -Stata terminology.
data frame: referred to as right in R terminology, or using in -Stata terminology.
if (FALSE) {
-# Check passing with no errors
-library(data.table)
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-y1 = data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-joyn:::check_xy(x = x1, y=y1)
-}
-
R/checks.R
- check_y_vars_to_keep.Rd
check and return variable names in y to keep in returning table, excluding those that are keys of the merge
-either TRUE, if keep all vars in y
;
-FALSE or NULL, if keep no vars; or character vector specifying which variables in y
to keep
data frame
A vector of shared column names in x
and y
to merge on.
-This defaults to the shared key columns between the two tables.
-If y
has no key columns, this defaults to the key of x
.
if (FALSE) {
-y1 = data.table(id = 1:2,
- y = c(11L, 15L))
-# With y_vars_to_keep TRUE
-joyn:::check_y_vars_to_keep(TRUE, y1, by = "id")
-# With y_vars_to_keep FALSE
-joyn:::check_y_vars_to_keep(FALSE, y1, by = "id")
-# Specifying which y vars to keep
-joyn:::check_y_vars_to_keep("y", y1, by = "id")
-}
-
Clearing joyn environment
-Messages functions
-joyn_msg()
,
-joyn_msgs_exist()
,
-joyn_report()
,
-msg_type_dt()
,
-store_msg()
,
-style()
,
-type_choices()
if (FALSE) {
-# Storing a message
-joyn:::store_msg("info", "simple message")
-
-# Clearing the environment
-joyn:::clear_joynenv()
-
-# Checking it does not exist in the environment
-print(joyn:::joyn_msgs_exist())
-}
-
tabulate one variable frequencies
-data frame
character: name of variable to tabulate. Use Standard evaluation.
numeric: number of decimal places to display. Default is 1.
logical: report NA values in frequencies. Default is FALSE.
library(data.table)
-x4 = data.table(id1 = c(1, 1, 2, 3, 3),
- id2 = c(1, 1, 2, 3, 4),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-freq_table(x4, "id1")
-
This is a joyn
wrapper that works in a similar
-fashion to dplyr::full_join
full_join(
- x,
- y,
- by = intersect(names(x), names(y)),
- copy = FALSE,
- suffix = c(".x", ".y"),
- keep = NULL,
- na_matches = c("na", "never"),
- multiple = "all",
- unmatched = "drop",
- relationship = "one-to-one",
- y_vars_to_keep = TRUE,
- update_values = FALSE,
- update_NAs = update_values,
- reportvar = getOption("joyn.reportvar"),
- reporttype = c("factor", "character", "numeric"),
- roll = NULL,
- keep_common_vars = FALSE,
- sort = TRUE,
- verbose = getOption("joyn.verbose"),
- ...
-)
data frame: referred to as left in R terminology, or master in -Stata terminology.
data frame: referred to as right in R terminology, or using in -Stata terminology.
a character vector of variables to join by. If NULL, the default,
-joyn will do a natural join, using all variables with common names across
-the two tables. A message lists the variables so that you can check they're
-correct (to suppress the message, simply explicitly list the variables that
-you want to join). To join by different variables on x and y use a vector
-of expressions. For example, by = c("a = b", "z")
will use "a" in x
, "b"
-in y
, and "z" in both tables.
If x
and y
are not from the same data source,
-and copy
is TRUE
, then y
will be copied into the
-same src as x
. This allows you to join tables across srcs, but
-it is a potentially expensive operation so you must opt into it.
If there are non-joined duplicate variables in x
and
-y
, these suffixes will be added to the output to disambiguate them.
-Should be a character vector of length 2.
Should the join keys from both x
and y
be preserved in the
-output?
If NULL
, the default, joins on equality retain only the keys from x
,
-while joins on inequality retain the keys from both inputs.
If TRUE
, all keys from both inputs are retained.
If FALSE
, only keys from x
are retained. For right and full joins,
-the data in key columns corresponding to rows that only exist in y
are
-merged into the key columns from x
. Can't be used when joining on
-inequality conditions.
Should two NA
or two NaN
values match?
Handling of rows in x
with multiple matches in y
.
-For each row of x
:
"all"
, the default, returns every match detected in y
. This is the
-same behavior as SQL.
"any"
returns one match detected in y
, with no guarantees on which
-match will be returned. It is often faster than "first"
and "last"
-if you just need to detect if there is at least one match.
"first"
returns the first match detected in y
.
"last"
returns the last match detected in y
.
How should unmatched keys that would result in dropped rows -be handled?
"drop"
drops unmatched keys from the result.
"error"
throws an error if unmatched keys are detected.
unmatched
is intended to protect you from accidentally dropping rows
-during a join. It only checks for unmatched keys in the input that could
-potentially drop rows.
For left joins, it checks y
.
For right joins, it checks x
.
For inner joins, it checks both x
and y
. In this case, unmatched
is
-also allowed to be a character vector of length 2 to specify the behavior
-for x
and y
independently.
Handling of the expected relationship between the keys of
-x
and y
. If the expectations chosen from the list below are
-invalidated, an error is thrown.
NULL
, the default, doesn't expect there to be any relationship between
-x
and y
. However, for equality joins it will check for a many-to-many
-relationship (which is typically unexpected) and will warn if one occurs,
-encouraging you to either take a closer look at your inputs or make this
-relationship explicit by specifying "many-to-many"
.
See the Many-to-many relationships section for more details.
"one-to-one"
expects:
Each row in x
matches at most 1 row in y
.
Each row in y
matches at most 1 row in x
.
"one-to-many"
expects:
Each row in y
matches at most 1 row in x
.
"many-to-one"
expects:
Each row in x
matches at most 1 row in y
.
"many-to-many"
doesn't perform any relationship checks, but is provided
-to allow you to be explicit about this relationship if you know it
-exists.
relationship
doesn't handle cases where there are zero matches. For that,
-see unmatched
.
character: Vector of variable names in y
that will be
-kept after the merge. If TRUE (the default), it brings all
-the variables in y into x. If FALSE or NULL, it does not bring any variable
-into x, but a report will be generated.
logical: If TRUE, it will update all values of variables
-in x with the actual values of variables in y with the same name as the ones in x.
-NAs from y won't be used to update actual values in x. Yet, by default,
-NAs in x will be updated with values in y. To avoid this, make sure to set
-update_NAs = FALSE
logical: If TRUE, it will update NA values of all variables
-in x with actual values of variables in y that have the same name as the
-ones in x. If FALSE, NA values won't be updated, even if update_values
is
-TRUE
character: Name of reporting variable. Default is ".joyn". -This is the same as variable "_merge" in Stata after performing a merge. If -FALSE or NULL, the reporting variable will be excluded from the final -table, though a summary of the join will be displayed after concluding.
character: One of "factor", "character" or "numeric". Default is -"factor". If "numeric", the reporting variable will contain numeric -codes of the source and the contents of each observation in the joined -table. See below for more information.
double: to be implemented
logical: If TRUE, it will keep the original variable -from y when both tables have common variable names. Thus, the prefix "y." -will be added to the original name to distinguish from the resulting -variable in the joined table.
logical: If TRUE, sort by key variables in by
. Default is
-FALSE.
logical: if FALSE, it won't display any message (programmer's -option). Default is TRUE.
Arguments passed on to joyn
match_type
character: one of "m:m", "m:1", "1:m", "1:1". -Default is "1:1" since this is the most restrictive. However, following -Stata's recommendation, it is better to be explicit and use any of the -other three match types (See details in match types sections).
allow.cartesian
logical: Check documentation in official web site.
-Default is NULL
, which implies that if the join is "1:1" it will be
-FALSE
, but if the join has any "m" on it, it will be converted to TRUE
.
-By specifying TRUE
or FALSE
you force the behavior of the join.
suffixes
A character(2) specifying the suffixes to be used for making -non-by column names unique. The suffix behaviour works in a similar fashion -as the base::merge method does.
yvars
keep_y_in_x
msg_type
character: type of messages to display by default
na.last
logical
. If TRUE
, missing values in the data are placed last; if FALSE
, they are placed first; if NA
they are removed.
-na.last=NA
is valid only for x[order(., na.last)]
and its
-default is TRUE
. setorder
and setorderv
only accept
-TRUE
/FALSE
with default FALSE
.
A data frame of the same class as x
. The properties of the output
-are as close as possible to the ones returned by the dplyr alternative.
Other dplyr alternatives:
-anti_join()
,
-inner_join()
,
-left_join()
,
-right_join()
# Simple full join
-library(data.table)
-
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-y1 = data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-full_join(x1, y1, relationship = "many-to-one")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 33.3%
-#> 2 y 1 16.7%
-#> 3 x & y 3 50%
-#> 4 total 6 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: 4 NA NA 16 y
-#> 6: NA NA 15 NA x
-
This function aims to display and store info on joyn options
-environment, which is joyn environment by default
logical, if TRUE displays (i.e., print) info on joyn options and -corresponding default and current values
character or NULL. If character, name of a specific joyn option. -If NULL, all joyn options
JOYn options functions
-set_joyn_options()
if (FALSE) {
-
-# display all joyn options, their default and current values
-joyn:::get_joyn_options()
-
-# store list of option = value pairs AND do not display info
-joyn_options <- joyn:::get_joyn_options(display = FALSE)
-
-# get info on one specific option and store it
-joyn.verbose <- joyn:::get_joyn_options(option = "joyn.verbose")
-
-# get info on two specific option
-joyn:::get_joyn_options(option = c("joyn.verbose", "joyn.reportvar"))
-
-}
-
Since the objective of joyn is to join tables with joy, there is only one main function in this package
- - -joyn()
- full_join()
- right_join()
- left_join()
- inner_join()
- anti_join()
- merge()
- joyn_msg()
- joyn_report()
- get_joyn_options()
- set_joyn_options()
- is_id()
- by
variablefreq_table()
- possible_ids()
- is_balanced()
- rename_to_valid()
- joyn_msg()
- This is a joyn
wrapper that works in a similar fashion to
-dplyr::inner_join
inner_join(
- x,
- y,
- by = intersect(names(x), names(y)),
- copy = FALSE,
- suffix = c(".x", ".y"),
- keep = NULL,
- na_matches = c("na", "never"),
- multiple = "all",
- unmatched = "drop",
- relationship = "one-to-one",
- y_vars_to_keep = TRUE,
- update_values = FALSE,
- update_NAs = update_values,
- reportvar = getOption("joyn.reportvar"),
- reporttype = c("factor", "character", "numeric"),
- roll = NULL,
- keep_common_vars = FALSE,
- sort = TRUE,
- verbose = getOption("joyn.verbose"),
- ...
-)
data frame: referred to as left in R terminology, or master in -Stata terminology.
data frame: referred to as right in R terminology, or using in -Stata terminology.
a character vector of variables to join by. If NULL, the default,
-joyn will do a natural join, using all variables with common names across
-the two tables. A message lists the variables so that you can check they're
-correct (to suppress the message, simply explicitly list the variables that
-you want to join). To join by different variables on x and y use a vector
-of expressions. For example, by = c("a = b", "z")
will use "a" in x
,
-"b" in y
, and "z" in both tables.
If x
and y
are not from the same data source,
-and copy
is TRUE
, then y
will be copied into the
-same src as x
. This allows you to join tables across srcs, but
-it is a potentially expensive operation so you must opt into it.
If there are non-joined duplicate variables in x
and
-y
, these suffixes will be added to the output to disambiguate them.
-Should be a character vector of length 2.
Should the join keys from both x
and y
be preserved in the
-output?
If NULL
, the default, joins on equality retain only the keys from x
,
-while joins on inequality retain the keys from both inputs.
If TRUE
, all keys from both inputs are retained.
If FALSE
, only keys from x
are retained. For right and full joins,
-the data in key columns corresponding to rows that only exist in y
are
-merged into the key columns from x
. Can't be used when joining on
-inequality conditions.
Should two NA
or two NaN
values match?
Handling of rows in x
with multiple matches in y
.
-For each row of x
:
"all"
, the default, returns every match detected in y
. This is the
-same behavior as SQL.
"any"
returns one match detected in y
, with no guarantees on which
-match will be returned. It is often faster than "first"
and "last"
-if you just need to detect if there is at least one match.
"first"
returns the first match detected in y
.
"last"
returns the last match detected in y
.
How should unmatched keys that would result in dropped rows -be handled?
"drop"
drops unmatched keys from the result.
"error"
throws an error if unmatched keys are detected.
unmatched
is intended to protect you from accidentally dropping rows
-during a join. It only checks for unmatched keys in the input that could
-potentially drop rows.
For left joins, it checks y
.
For right joins, it checks x
.
For inner joins, it checks both x
and y
. In this case, unmatched
is
-also allowed to be a character vector of length 2 to specify the behavior
-for x
and y
independently.
Handling of the expected relationship between the keys of
-x
and y
. If the expectations chosen from the list below are
-invalidated, an error is thrown.
NULL
, the default, doesn't expect there to be any relationship between
-x
and y
. However, for equality joins it will check for a many-to-many
-relationship (which is typically unexpected) and will warn if one occurs,
-encouraging you to either take a closer look at your inputs or make this
-relationship explicit by specifying "many-to-many"
.
See the Many-to-many relationships section for more details.
"one-to-one"
expects:
Each row in x
matches at most 1 row in y
.
Each row in y
matches at most 1 row in x
.
"one-to-many"
expects:
Each row in y
matches at most 1 row in x
.
"many-to-one"
expects:
Each row in x
matches at most 1 row in y
.
"many-to-many"
doesn't perform any relationship checks, but is provided
-to allow you to be explicit about this relationship if you know it
-exists.
relationship
doesn't handle cases where there are zero matches. For that,
-see unmatched
.
character: Vector of variable names in y
that will be
-kept after the merge. If TRUE (the default), it brings all
-the variables in y into x. If FALSE or NULL, it does not bring any variable
-into x, but a report will be generated.
logical: If TRUE, it will update all values of variables
-in x with the actual values of variables in y with the same name as the ones in x.
-NAs from y won't be used to update actual values in x. Yet, by default,
-NAs in x will be updated with values in y. To avoid this, make sure to set
-update_NAs = FALSE
logical: If TRUE, it will update NA values of all variables
-in x with actual values of variables in y that have the same name as the
-ones in x. If FALSE, NA values won't be updated, even if update_values
is
-TRUE
character: Name of reporting variable. Default is ".joyn". -This is the same as variable "_merge" in Stata after performing a merge. If -FALSE or NULL, the reporting variable will be excluded from the final -table, though a summary of the join will be displayed after concluding.
character: One of "factor", "character" or "numeric". Default is -"factor". If "numeric", the reporting variable will contain numeric -codes of the source and the contents of each observation in the joined -table. See below for more information.
double: to be implemented
logical: If TRUE, it will keep the original variable -from y when both tables have common variable names. Thus, the prefix "y." -will be added to the original name to distinguish from the resulting -variable in the joined table.
logical: If TRUE, sort by key variables in by
. Default is
-FALSE.
logical: if FALSE, it won't display any message (programmer's -option). Default is TRUE.
Arguments passed on to joyn
match_type
character: one of "m:m", "m:1", "1:m", "1:1". -Default is "1:1" since this is the most restrictive. However, following -Stata's recommendation, it is better to be explicit and use any of the -other three match types (See details in match types sections).
allow.cartesian
logical: Check documentation in official web site.
-Default is NULL
, which implies that if the join is "1:1" it will be
-FALSE
, but if the join has any "m" on it, it will be converted to TRUE
.
-By specifying TRUE
or FALSE
you force the behavior of the join.
suffixes
A character(2) specifying the suffixes to be used for making -non-by column names unique. The suffix behaviour works in a similar fashion -as the base::merge method does.
yvars
keep_y_in_x
msg_type
character: type of messages to display by default
na.last
logical
. If TRUE
, missing values in the data are placed last; if FALSE
, they are placed first; if NA
they are removed.
-na.last=NA
is valid only for x[order(., na.last)]
and its
-default is TRUE
. setorder
and setorderv
only accept
-TRUE
/FALSE
with default FALSE
.
A data frame of the same class as x
. The properties of the output
-are as close as possible to the ones returned by the dplyr alternative.
Other dplyr alternatives:
-anti_join()
,
-full_join()
,
-left_join()
,
-right_join()
# Simple inner join
-library(data.table)
-
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-y1 = data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-inner_join(x1, y1, relationship = "many-to-one")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 66.7%
-#> 2 y 1 33.3%
-#> 3 total 3 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-
Check if the data frame is balanced by group of columns, i.e., if it contains every combination of the elements in the specified variables
-is_balanced(df, by, return = c("logic", "table"))
data frame
character: variables used to check if df
is balanced
character: either "logic" or "table". If "logic", returns TRUE
-or FALSE
depending on whether data frame is balanced. If "table" returns the unbalanced
-observations - i.e. the combinations of elements in specified variables not found in input df
x1 = data.frame(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-is_balanced(df = x1,
- by = c("id", "t"),
- return = "table") # returns combination of elements in "id" and "t" not present in df
-#> id t
-#> 1 3 1
-#> 2 2 2
-is_balanced(df = x1,
- by = c("id", "t"),
- return = "logic") # FALSE
-#> [1] FALSE
-
report if dt is uniquely identified by by
var or, if return_report = TRUE, the duplicates in by
variable
is_id(dt, by, verbose = getOption("joyn.verbose"), return_report = FALSE)
either right or left table
variable to merge by
logical: if TRUE messages will be displayed
logical: if TRUE, returns data with summary of duplicates.
-If FALSE, returns logical value depending on whether dt
is uniquely identified
-by by
library(data.table)
-
-# example with data frame not uniquely identified by `by` var
-
-y <- data.table(id = c("c","b", "c", "a"),
- y = c(11L, 15L, 18L, 20L))
-is_id(y, by = "id")
-#>
-#> ── Duplicates in terms of `id`
-#> copies n percent
-#> 1 1 2 66.7%
-#> 2 2 1 33.3%
-#> 3 total 3 100%
-#> ─────────────────────────────────────────────────────── End of is_id() report ──
-#> [1] FALSE
-is_id(y, by = "id", return_report = TRUE)
-#>
-#> ── Duplicates in terms of `id`
-#> copies n percent
-#> 1 1 2 66.7%
-#> 2 2 1 33.3%
-#> 3 total 3 100%
-#> ─────────────────────────────────────────────────────── End of is_id() report ──
-#> id copies
-#> <char> <int>
-#> 1: c 2
-#> 2: b 1
-#> 3: a 1
-
-# example with data frame uniquely identified by `by` var
-
-y1 <- data.table(id = c("1","3", "2", "9"),
- y = c(11L, 15L, 18L, 20L))
-is_id(y1, by = "id")
-#>
-#> ── Duplicates in terms of `id`
-#> copies n percent
-#> 1 1 4 100%
-#> 2 total 4 100%
-#> ─────────────────────────────────────────────────────── End of is_id() report ──
-#> [1] TRUE
-
Confirm if match type error
-name of data frame
A vector of shared column names in x
and y
to merge on.
-This defaults to the shared key columns between the two tables.
-If y
has no key columns, this defaults to the key of x
.
logical: from existing code
if (FALSE) {
-# example with dt not uniquely identified by "id"
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-joyn:::is_match_type_error(x1, name = "x1", by = "id")
-}
-
When "many" relationship is specified, check if it is valid.
(Specified many relationship not valid if the dt is instead uniquely identified by specified keys)
if (FALSE) {
-# example with data frame uniquely identified by specified `by` vars
-x1 = data.frame(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-
-joyn:::is_valid_m_key(x1, by = c("id", "t"))
-# example with valid specified "many" relationship
-x2 = data.frame(id = c(1L, 1L, 1L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-joyn:::is_valid_m_key(x2, by = c("id", "t"))
-}
-
R/joyn-package.R
- joyn-package.Rd
Tool for diagnosing table joins. It combines the speed of `collapse` and `data.table`, the flexibility of `dplyr`, and the diagnosis and features of the `merge` command in `Stata`.
-Maintainer: R.Andres Castaneda acastanedaa@worldbank.org
-Authors:
Zander Prinsloo zprinsloo@worldbank.org
Rossana Tatulli rtatulli@worldbank.org
This is the primary function in the joyn
package. It executes a full join,
-performs a number of checks, and filters to allow the user-specified join.
joyn(
- x,
- y,
- by = intersect(names(x), names(y)),
- match_type = c("1:1", "1:m", "m:1", "m:m"),
- keep = c("full", "left", "master", "right", "using", "inner", "anti"),
- y_vars_to_keep = ifelse(keep == "anti", FALSE, TRUE),
- update_values = FALSE,
- update_NAs = update_values,
- reportvar = getOption("joyn.reportvar"),
- reporttype = c("factor", "character", "numeric"),
- roll = NULL,
- keep_common_vars = FALSE,
- sort = FALSE,
- verbose = getOption("joyn.verbose"),
- suffixes = getOption("joyn.suffixes"),
- allow.cartesian = deprecated(),
- yvars = deprecated(),
- keep_y_in_x = deprecated(),
- na.last = getOption("joyn.na.last"),
- msg_type = getOption("joyn.msg_type")
-)
data frame: referred to as left in R terminology, or master in -Stata terminology.
data frame: referred to as right in R terminology, or using in -Stata terminology.
a character vector of variables to join by. If NULL, the default,
-joyn will do a natural join, using all variables with common names across
-the two tables. A message lists the variables so that you can check they're
-correct (to suppress the message, simply explicitly list the variables that
-you want to join). To join by different variables on x and y use a vector
-of expressions. For example, by = c("a = b", "z")
will use "a" in x
,
-"b" in y
, and "z" in both tables.
character: one of "m:m", "m:1", "1:m", "1:1". -Default is "1:1" since this is the most restrictive. However, following -Stata's recommendation, it is better to be explicit and use any of the -other three match types (See details in match types sections).
atomic character vector of length 1: One of "full", "left",
-"master", "right",
-"using", "inner". Default is "full". Even though this is not the
-regular behavior of joins in R, the objective of joyn
is to present a
-diagnosis of the join which requires a full join. That is why the default
-is a full join. Yet, if "left" or "master", it keeps the observations
-that matched in both tables and the ones that did not match in x. The ones
-in y will be discarded. If "right" or "using", it keeps the
-observations that matched in both tables and the ones that did not match in
-y. The ones in x will be discarded. If "inner", it only keeps the
-observations that matched both tables. Note that if, for example, keep = "left", the
joyn() function still executes a full join under the hood and then filters the rows so that the output table is a left join. This behaviour, while inefficient, allows all the diagnostics and checks conducted by
joyn.
character: Vector of variable names in y
that will be
-kept after the merge. If TRUE (the default), it brings all
-the variables in y into x. If FALSE or NULL, it does not bring any variable
-into x, but a report will be generated.
logical: If TRUE, it will update all values of variables
-in x with the actual values of variables in y with the same name as the ones in x.
-NAs from y won't be used to update actual values in x. Yet, by default,
-NAs in x will be updated with values in y. To avoid this, make sure to set
-update_NAs = FALSE
logical: If TRUE, it will update NA values of all variables
-in x with actual values of variables in y that have the same name as the
-ones in x. If FALSE, NA values won't be updated, even if update_values
is
-TRUE
character: Name of reporting variable. Default is ".joyn". -This is the same as variable "_merge" in Stata after performing a merge. If -FALSE or NULL, the reporting variable will be excluded from the final -table, though a summary of the join will be displayed after concluding.
character: One of "factor", "character" or "numeric". Default is -"factor". If "numeric", the reporting variable will contain numeric -codes of the source and the contents of each observation in the joined -table. See below for more information.
double: to be implemented
logical: If TRUE, it will keep the original variable -from y when both tables have common variable names. Thus, the prefix "y." -will be added to the original name to distinguish from the resulting -variable in the joined table.
logical: If TRUE, sort by key variables in by
. Default is
-FALSE.
logical: if FALSE, it won't display any message (programmer's -option). Default is TRUE.
A character(2) specifying the suffixes to be used for making -non-by column names unique. The suffix behaviour works in a similar fashion -as the base::merge method does.
logical: Check documentation in official web site.
-Default is NULL
, which implies that if the join is "1:1" it will be
-FALSE
, but if the join has any "m" on it, it will be converted to TRUE
.
-By specifying TRUE
or FALSE
you force the behavior of the join.
logical
. If TRUE
, missing values in the data are placed last; if FALSE
, they are placed first; if NA
they are removed.
-na.last=NA
is valid only for x[order(., na.last)]
and its
-default is TRUE
. setorder
and setorderv
only accept
-TRUE
/FALSE
with default FALSE
.
character: type of messages to display by default
Using the same wording of the Stata manual
-1:1: specifies a one-to-one match merge. The variables specified in
-by
uniquely identify single observations in both tables.
1:m and m:1: specify one-to-many and many-to-one match merges,
-respectively. This means that in one of the tables the observations are
-uniquely identified by the variables in by
, while in the other table many
-(two or more) of the observations are identified by the variables in by
m:m refers to a many-to-many merge. Variables in by
do not uniquely
-identify the observations in either table. Matching is performed by
-combining observations with equal values in by
; within matching values,
-the first observation in the master (i.e. left or x) table is matched with
-the first matching observation in the using (i.e. right or y) table; the
-second, with the second; and so on. If there is an unequal number of
-observations within a group, then the last observation of the shorter group
-is used repeatedly to match with subsequent observations of the longer
-group.
If reporttype = "numeric"
, then the numeric values have the following
-meaning:
1: row comes from x
, i.e. "x" 2: row comes from y
, i.e. "y" 3: row from
-both x
and y
, i.e. "x & y" 4: row has NA in x
that has been updated
-with y
, i.e. "NA updated" 5: row has value in x
that has been updated
-with y
, i.e. "value updated" 6: row from x
that has not been updated,
-i.e. "not updated"
NA
s are placed either at first or at last in the
-resulting data.frame depending on the value of getOption("joyn.na.last")
.
-The Default is FALSE
as it is the default value of
-data.table::setorderv.
# Simple join
-library(data.table)
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
-t = c(1L, 2L, 1L, 2L, NA_integer_),
-x = 11:15)
-
-y1 = data.table(id = 1:2,
- y = c(11L, 15L))
-
-x2 = data.table(id = c(1, 1, 2, 3, NA),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-
-y2 = data.table(id = c(1, 2, 5, 6, 3),
- yd = c(1, 2, 5, 6, 3),
- y = c(11L, 15L, 20L, 13L, 10L),
- x = c(16:20))
-joyn(x1, y1, match_type = "m:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 40%
-#> 2 x & y 3 60%
-#> 3 total 5 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id t x y .joyn
-#> <int> <int> <int> <int> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-
-# Bad merge for not specifying by argument or match_type
-joyn(x2, y2)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 4 44.4%
-#> 2 y 4 44.4%
-#> 3 x & y 1 11.1%
-#> 4 total 9 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id and x from id, yd, y, and x
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 1 2 12 NA NA x
-#> 3: 2 1 NA NA NA x
-#> 4: 3 2 NA NA NA x
-#> 5: NA NA 15 NA NA x
-#> 6: 2 NA 17 2 15 y
-#> 7: 5 NA 18 5 20 y
-#> 8: 6 NA 19 6 13 y
-#> 9: 3 NA 20 3 10 y
-
-# good merge, ignoring variable x from y
-joyn(x2, y2, by = "id", match_type = "m:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 1 14.3%
-#> 2 y 2 28.6%
-#> 3 x & y 4 57.1%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, yd, y, and x
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 1 2 12 1 11 x & y
-#> 3: 2 1 NA 2 15 x & y
-#> 4: 3 2 NA 3 10 x & y
-#> 5: NA NA 15 NA NA x
-#> 6: 5 NA NA 5 20 y
-#> 7: 6 NA NA 6 13 y
-
-# update NAs in x variable from y
-joyn(x2, y2, by = "id", update_NAs = TRUE, match_type = "m:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 1 14.3%
-#> 2 x & y 2 28.6%
-#> 3 NA updated 4 57.1%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, yd, y, and x
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 x & y
-#> 2: 1 2 12 1 11 x & y
-#> 3: 2 1 17 2 15 NA updated
-#> 4: 3 2 20 3 10 NA updated
-#> 5: NA NA 15 NA NA x
-#> 6: 5 NA 18 5 20 NA updated
-#> 7: 6 NA 19 6 13 NA updated
-
-# Update values in x with variables from y
-joyn(x2, y2, by = "id", update_values = TRUE, match_type = "m:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 NA updated 4 57.1%
-#> 2 value updated 2 28.6%
-#> 3 not updated 1 14.3%
-#> 4 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id, yd, y, and x
-#> id t x yd y .joyn
-#> <num> <int> <num> <num> <int> <fctr>
-#> 1: 1 1 16 1 11 value updated
-#> 2: 1 2 16 1 11 value updated
-#> 3: 2 1 17 2 15 NA updated
-#> 4: 3 2 20 3 10 NA updated
-#> 5: NA NA 15 NA NA not updated
-#> 6: 5 NA 18 5 20 NA updated
-#> 7: 6 NA 19 6 13 NA updated
-
-
display type of joyn message
-joyn_msg(msg_type = getOption("joyn.msg_type"), msg = NULL)
character: one or more of the following: -all, basic, info, note, warn, timing, or err
character vector to be passed to cli::cli_abort()
. Default is
-NULL. It only works if "err" %in% msg_type
. This is an internal argument.
Messages functions
-clear_joynenv()
,
-joyn_msgs_exist()
,
-joyn_report()
,
-msg_type_dt()
,
-store_msg()
,
-style()
,
-type_choices()
library(data.table)
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
-t = c(1L, 2L, 1L, 2L, NA_integer_),
-x = 11:15)
-
-y1 = data.table(id = 1:2,
- y = c(11L, 15L))
-df <- joyn(x1, y1, match_type = "m:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 40%
-#> 2 x & y 3 60%
-#> 3 total 5 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-joyn_msg("basic")
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-joyn_msg("all")
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> ● Timing:The full joyn is executed in 0.000205 seconds.
-#> ● Timing: The entire joyn function, including checks, is executed in 0.025927
-#> seconds.
-
Checks the presence of joyn messages stored in joyn environment
-Messages functions
-clear_joynenv()
,
-joyn_msg()
,
-joyn_report()
,
-msg_type_dt()
,
-store_msg()
,
-style()
,
-type_choices()
Print JOYn report table
-joyn_report(verbose = getOption("joyn.verbose"))
logical: if FALSE, it won't display any message (programmer's -option). Default is TRUE.
Messages functions
-clear_joynenv()
,
-joyn_msg()
,
-joyn_msgs_exist()
,
-msg_type_dt()
,
-store_msg()
,
-style()
,
-type_choices()
library(data.table)
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
-t = c(1L, 2L, 1L, 2L, NA_integer_),
-x = 11:15)
-
-y1 = data.table(id = 1:2,
- y = c(11L, 15L))
-
-d <- joyn(x1, y1, match_type = "m:1")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 40%
-#> 2 x & y 3 60%
-#> 3 total 5 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-joyn_report(verbose = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 40%
-#> 2 x & y 3 60%
-#> 3 total 5 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-
joyn
R/joyn_workhorse.R
- joyn_workhorse.Rd
Always executes a full join.
-data object, "left" or "master"
data object, "right" or "using"
atomic character vector: key specifying join
logical: sort the result by the columns in by
-x
and y
atomic character vector: give suffixes to columns common to both
if (FALSE) {
-# Full join
-library(data.table)
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-y1 = data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-joyn:::joyn_workhorse(x = x1, y=y1)
-}
-
This is a joyn
wrapper that works in a similar
-fashion to dplyr::left_join
left_join(
- x,
- y,
- by = intersect(names(x), names(y)),
- copy = FALSE,
- suffix = c(".x", ".y"),
- keep = NULL,
- na_matches = c("na", "never"),
- multiple = "all",
- unmatched = "drop",
- relationship = NULL,
- y_vars_to_keep = TRUE,
- update_values = FALSE,
- update_NAs = update_values,
- reportvar = getOption("joyn.reportvar"),
- reporttype = c("factor", "character", "numeric"),
- roll = NULL,
- keep_common_vars = FALSE,
- sort = TRUE,
- verbose = getOption("joyn.verbose"),
- ...
-)
data frame: referred to as left in R terminology, or master in -Stata terminology.
data frame: referred to as right in R terminology, or using in -Stata terminology.
a character vector of variables to join by. If NULL, the default,
-joyn will do a natural join, using all variables with common names across
-the two tables. A message lists the variables so that you can check they're
-correct (to suppress the message, simply explicitly list the variables that
-you want to join). To join by different variables on x and y use a vector
-of expressions. For example, by = c("a = b", "z")
will use "a" in x
, "b"
-in y
, and "z" in both tables.
If x
and y
are not from the same data source,
-and copy
is TRUE
, then y
will be copied into the
-same src as x
. This allows you to join tables across srcs, but
-it is a potentially expensive operation so you must opt into it.
If there are non-joined duplicate variables in x
and
-y
, these suffixes will be added to the output to disambiguate them.
-Should be a character vector of length 2.
Should the join keys from both x
and y
be preserved in the
-output?
If NULL
, the default, joins on equality retain only the keys from x
,
-while joins on inequality retain the keys from both inputs.
If TRUE
, all keys from both inputs are retained.
If FALSE
, only keys from x
are retained. For right and full joins,
-the data in key columns corresponding to rows that only exist in y
are
-merged into the key columns from x
. Can't be used when joining on
-inequality conditions.
Should two NA
or two NaN
values match?
Handling of rows in x
with multiple matches in y
.
-For each row of x
:
"all"
, the default, returns every match detected in y
. This is the
-same behavior as SQL.
"any"
returns one match detected in y
, with no guarantees on which
-match will be returned. It is often faster than "first"
and "last"
-if you just need to detect if there is at least one match.
"first"
returns the first match detected in y
.
"last"
returns the last match detected in y
.
How should unmatched keys that would result in dropped rows -be handled?
"drop"
drops unmatched keys from the result.
"error"
throws an error if unmatched keys are detected.
unmatched
is intended to protect you from accidentally dropping rows
-during a join. It only checks for unmatched keys in the input that could
-potentially drop rows.
For left joins, it checks y
.
For right joins, it checks x
.
For inner joins, it checks both x
and y
. In this case, unmatched
is
-also allowed to be a character vector of length 2 to specify the behavior
-for x
and y
independently.
Handling of the expected relationship between the keys of
-x
and y
. If the expectations chosen from the list below are
-invalidated, an error is thrown.
NULL
, the default, doesn't expect there to be any relationship between
-x
and y
. However, for equality joins it will check for a many-to-many
-relationship (which is typically unexpected) and will warn if one occurs,
-encouraging you to either take a closer look at your inputs or make this
-relationship explicit by specifying "many-to-many"
.
See the Many-to-many relationships section for more details.
"one-to-one"
expects:
Each row in x
matches at most 1 row in y
.
Each row in y
matches at most 1 row in x
.
"one-to-many"
expects:
Each row in y
matches at most 1 row in x
.
"many-to-one"
expects:
Each row in x
matches at most 1 row in y
.
"many-to-many"
doesn't perform any relationship checks, but is provided
-to allow you to be explicit about this relationship if you know it
-exists.
relationship
doesn't handle cases where there are zero matches. For that,
-see unmatched
.
character: Vector of variable names in y
that will be
-kept after the merge. If TRUE (the default), it brings all
-the variables in y into x. If FALSE or NULL, it does not bring any variable
-into x, but a report will be generated.
logical: If TRUE, it will update all values of variables
-in x with the actual values of variables in y with the same name as the ones in x.
-NAs from y won't be used to update actual values in x. Yet, by default,
-NAs in x will be updated with values in y. To avoid this, make sure to set
-update_NAs = FALSE
logical: If TRUE, it will update NA values of all variables
-in x with actual values of variables in y that have the same name as the
-ones in x. If FALSE, NA values won't be updated, even if update_values
is
-TRUE
character: Name of reporting variable. Default is ".joyn". -This is the same as variable "_merge" in Stata after performing a merge. If -FALSE or NULL, the reporting variable will be excluded from the final -table, though a summary of the join will be displayed after concluding.
character: One of "factor", "character" or "numeric". Default is -"factor". If "numeric", the reporting variable will contain numeric -codes of the source and the contents of each observation in the joined -table. See below for more information.
double: to be implemented
logical: If TRUE, it will keep the original variable -from y when both tables have common variable names. Thus, the prefix "y." -will be added to the original name to distinguish from the resulting -variable in the joined table.
logical: If TRUE, sort by key variables in by
. Default is
-FALSE.
logical: if FALSE, it won't display any message (programmer's -option). Default is TRUE.
Arguments passed on to joyn
match_type
character: one of "m:m", "m:1", "1:m", "1:1". -Default is "1:1" since this is the most restrictive. However, following -Stata's recommendation, it is better to be explicit and use any of the -other three match types (See details in match types sections).
allow.cartesian
logical: Check documentation in official web site.
-Default is NULL
, which implies that if the join is "1:1" it will be
-FALSE
, but if the join has any "m" on it, it will be converted to TRUE
.
-By specifying TRUE
or FALSE
you force the behavior of the join.
suffixes
A character(2) specifying the suffixes to be used for making -non-by column names unique. The suffix behaviour works in a similar fashion -as the base::merge method does.
yvars
keep_y_in_x
msg_type
character: type of messages to display by default
na.last
logical
. If TRUE
, missing values in the data are placed last; if FALSE
, they are placed first; if NA
they are removed.
-na.last=NA
is valid only for x[order(., na.last)]
and its
-default is TRUE
. setorder
and setorderv
only accept
-TRUE
/FALSE
with default FALSE
.
A data frame of the same class as x
. The properties of the output
-are as close as possible to the ones returned by the dplyr alternative.
Other dplyr alternatives:
-anti_join()
,
-full_join()
,
-inner_join()
,
-right_join()
# Simple left join
-library(data.table)
-
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-y1 = data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-left_join(x1, y1, relationship = "many-to-one")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 40%
-#> 2 y 1 20%
-#> 3 x & y 2 40%
-#> 4 total 5 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 3 2 14 NA x
-#> 5: NA NA 15 NA x
-
This is a joyn wrapper that works in a similar fashion to base::merge and -data.table::merge, which is why merge masks the other two.
-merge(
- x,
- y,
- by = NULL,
- by.x = NULL,
- by.y = NULL,
- all = FALSE,
- all.x = all,
- all.y = all,
- sort = TRUE,
- suffixes = c(".x", ".y"),
- no.dups = TRUE,
- allow.cartesian = getOption("datatable.allow.cartesian"),
- match_type = c("m:m", "m:1", "1:m", "1:1"),
- keep_common_vars = TRUE,
- ...
-)
data table
s. y
is coerced to a data.table
if
-it isn't one already.
A vector of shared column names in x
and y
to merge on.
-This defaults to the shared key columns between the two tables.
-If y
has no key columns, this defaults to the key of x
.
Vectors of column names in x
and y
to merge on.
logical; all = TRUE
is shorthand to save setting both
-all.x = TRUE
and all.y = TRUE
.
logical; if TRUE
, rows from x
which have no matching row
-in y
are included. These rows will have 'NA's in the columns that are usually
-filled with values from y
. The default is FALSE
so that only rows with
-data from both x
and y
are included in the output.
logical; analogous to all.x
above.
logical. If TRUE
(default), the rows of the merged
-data.table
are sorted by setting the key to the by / by.x
columns. If
-FALSE
, unlike base R's merge
for which row order is unspecified, the
-row order in x
is retained (including retaining the position of missing entries when
-all.x=TRUE
), followed by y
rows that don't match x
(when all.y=TRUE
)
-retaining the order those appear in y
.
A character(2)
specifying the suffixes to be used for
-making non-by
column names unique. The suffix behaviour works in a similar
-fashion as the merge.data.frame
method does.
logical indicating that suffixes
are also appended to
-non-by.y
column names in y
when they have the same column name
-as any by.x
.
See allow.cartesian
in [.data.table
.
character: one of "m:m", "m:1", "1:m", "1:1". -Default is "1:1" since this is the most restrictive. However, following -Stata's recommendation, it is better to be explicit and use any of the -other three match types (See details in match types sections).
logical: If TRUE, it will keep the original variable -from y when both tables have common variable names. Thus, the prefix "y." -will be added to the original name to distinguish from the resulting -variable in the joined table.
Arguments passed on to joyn
y_vars_to_keep
character: Vector of variable names in y
that will be
-kept after the merge. If TRUE (the default), it brings all
-the variables in y into x. If FALSE or NULL, it does not bring any variable
-into x, but a report will be generated.
reportvar
character: Name of reporting variable. Default is ".joyn". -This is the same as variable "_merge" in Stata after performing a merge. If -FALSE or NULL, the reporting variable will be excluded from the final -table, though a summary of the join will be displayed after concluding.
update_NAs
logical: If TRUE, it will update NA values of all variables
-in x with actual values of variables in y that have the same name as the
-ones in x. If FALSE, NA values won't be updated, even if update_values
is
-TRUE
update_values
logical: If TRUE, it will update all values of variables
-in x with the actual values of variables in y with the same name as the ones in x.
-NAs from y won't be used to update actual values in x. Yet, by default,
-NAs in x will be updated with values in y. To avoid this, make sure to set
-update_NAs = FALSE
verbose
logical: if FALSE, it won't display any message (programmer's -option). Default is TRUE.
x1 = data.frame(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-y1 = data.frame(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-joyn::merge(x1, y1, by = "id")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 66.7%
-#> 2 y 1 33.3%
-#> 3 total 3 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> ⚠ Warning: Supplied both by and by.x/by.y. by argument will be ignored.
-#> ⚠ Warning: The keys supplied uniquely identify y, therefore a m:1 join is
-#> executed
-#> id t x y .joyn
-#> 1 1 1 11 11 x & y
-#> 2 1 2 12 11 x & y
-#> 3 2 1 13 15 x & y
-# example of using by.x and by.y
-x2 = data.frame(id1 = c(1, 1, 2, 3, 3),
- id2 = c(1, 1, 2, 3, 4),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-y2 = data.frame(id = c(1, 2, 5, 6, 3),
- id2 = c(1, 1, 2, 3, 4),
- y = c(11L, 15L, 20L, 13L, 10L),
- x = c(16:20))
-jn <- joyn::merge(x2,
- y2,
- match_type = "m:m",
- all.x = TRUE,
- by.x = "id1",
- by.y = "id2")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 y 1 14.3%
-#> 2 x & y 6 85.7%
-#> 3 total 7 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables keyby1 from id, keyby1, y, and x
-#> ⚠ Warning: Supplied both by and by.x/by.y. by argument will be ignored.
-# example with all = TRUE
-jn <- joyn::merge(x2,
- y2,
- match_type = "m:m",
- by.x = "id1",
- by.y = "id2",
- all = TRUE)
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 y 1 12.5%
-#> 2 x & y 7 87.5%
-#> 3 total 8 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables keyby1 from id, keyby1, y, and x
-#> ⚠ Warning: Supplied both by and by.x/by.y. by argument will be ignored.
-
R/info_display.R
- msg_type_dt.Rd
convert style of joyn message to data frame containing type and message
-Messages functions
-clear_joynenv()
,
-joyn_msg()
,
-joyn_msgs_exist()
,
-joyn_report()
,
-store_msg()
,
-style()
,
-type_choices()
Identify possible variables uniquely identifying x
-possible_ids(
- dt,
- exclude = NULL,
- include = NULL,
- verbose = getOption("possible_ids.verbose")
-)
data frame
character: Exclude variables to be selected as identifiers. It -could be either the name of the variables or one type of variable -prefixed by "_". For instance, "_numeric" or "_character".
character: Name of variable to be included, that might belong
-to the group excluded in the exclude
logical: If FALSE no message will be displayed. Default is -TRUE
library(data.table)
-x4 = data.table(id1 = c(1, 1, 2, 3, 3),
- id2 = c(1, 1, 2, 3, 4),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = c(16, 12, NA, NA, 15))
-possible_ids(x4)
-#> ✔ There are no duplicates in data frame
-#> → we found 5 possible ids
-#> $V1
-#> [1] "id1" "t"
-#>
-#> $V2
-#> [1] "id1" "x"
-#>
-#> $V3
-#> [1] "id2" "t"
-#>
-#> $V4
-#> [1] "id2" "x"
-#>
-#> $V5
-#> [1] "t" "x"
-#>
-
Gives as output a vector of names to be used for the specified
-table that correspond to the by
argument for that table
process_by_vector(by, input = c("left", "right"))
character vector: by argument for join
character: either "left" or "right", indicating
-whether to give the left or right side of the equals ("=") if
-the equals is part of the by
vector
joyn:::process_by_vector(by = c("An = foo", "example"), input = "left")
-#> [1] "An" "example"
-
Rename to syntactically valid names
-rename_to_valid(name, verbose = getOption("joyn.verbose"))
This is a joyn
wrapper that works in a similar
-fashion to dplyr::right_join
right_join(
- x,
- y,
- by = intersect(names(x), names(y)),
- copy = FALSE,
- suffix = c(".x", ".y"),
- keep = NULL,
- na_matches = c("na", "never"),
- multiple = "all",
- unmatched = "drop",
- relationship = "one-to-one",
- y_vars_to_keep = TRUE,
- update_values = FALSE,
- update_NAs = update_values,
- reportvar = getOption("joyn.reportvar"),
- reporttype = c("factor", "character", "numeric"),
- roll = NULL,
- keep_common_vars = FALSE,
- sort = TRUE,
- verbose = getOption("joyn.verbose"),
- ...
-)
data frame: referred to as left in R terminology, or master in -Stata terminology.
data frame: referred to as right in R terminology, or using in -Stata terminology.
a character vector of variables to join by. If NULL, the default,
-joyn will do a natural join, using all variables with common names across
-the two tables. A message lists the variables so that you can check they're
-correct (to suppress the message, simply explicitly list the variables that
-you want to join). To join by different variables on x and y use a vector
-of expressions. For example, by = c("a = b", "z")
will use "a" in x
, "b"
-in y
, and "z" in both tables.
If x
and y
are not from the same data source,
-and copy
is TRUE
, then y
will be copied into the
-same src as x
. This allows you to join tables across srcs, but
-it is a potentially expensive operation so you must opt into it.
If there are non-joined duplicate variables in x
and
-y
, these suffixes will be added to the output to disambiguate them.
-Should be a character vector of length 2.
Should the join keys from both x
and y
be preserved in the
-output?
If NULL
, the default, joins on equality retain only the keys from x
,
-while joins on inequality retain the keys from both inputs.
If TRUE
, all keys from both inputs are retained.
If FALSE
, only keys from x
are retained. For right and full joins,
-the data in key columns corresponding to rows that only exist in y
are
-merged into the key columns from x
. Can't be used when joining on
-inequality conditions.
Should two NA
or two NaN
values match?
Handling of rows in x
with multiple matches in y
.
-For each row of x
:
"all"
, the default, returns every match detected in y
. This is the
-same behavior as SQL.
"any"
returns one match detected in y
, with no guarantees on which
-match will be returned. It is often faster than "first"
and "last"
-if you just need to detect if there is at least one match.
"first"
returns the first match detected in y
.
"last"
returns the last match detected in y
.
How should unmatched keys that would result in dropped rows -be handled?
"drop"
drops unmatched keys from the result.
"error"
throws an error if unmatched keys are detected.
unmatched
is intended to protect you from accidentally dropping rows
-during a join. It only checks for unmatched keys in the input that could
-potentially drop rows.
For left joins, it checks y
.
For right joins, it checks x
.
For inner joins, it checks both x
and y
. In this case, unmatched
is
-also allowed to be a character vector of length 2 to specify the behavior
-for x
and y
independently.
Handling of the expected relationship between the keys of
-x
and y
. If the expectations chosen from the list below are
-invalidated, an error is thrown.
NULL
, the default, doesn't expect there to be any relationship between
-x
and y
. However, for equality joins it will check for a many-to-many
-relationship (which is typically unexpected) and will warn if one occurs,
-encouraging you to either take a closer look at your inputs or make this
-relationship explicit by specifying "many-to-many"
.
See the Many-to-many relationships section for more details.
"one-to-one"
expects:
Each row in x
matches at most 1 row in y
.
Each row in y
matches at most 1 row in x
.
"one-to-many"
expects:
Each row in y
matches at most 1 row in x
.
"many-to-one"
expects:
Each row in x
matches at most 1 row in y
.
"many-to-many"
doesn't perform any relationship checks, but is provided
-to allow you to be explicit about this relationship if you know it
-exists.
relationship
doesn't handle cases where there are zero matches. For that,
-see unmatched
.
character: Vector of variable names in y
that will be
-kept after the merge. If TRUE (the default), it brings all
-the variables in y into x. If FALSE or NULL, it does not bring any variable
-into x, but a report will be generated.
logical: If TRUE, it will update all values of variables
-in x with the actual values of variables in y with the same name as the ones in x.
-NAs from y won't be used to update actual values in x. Yet, by default,
-NAs in x will be updated with values in y. To avoid this, make sure to set
-update_NAs = FALSE
logical: If TRUE, it will update NA values of all variables
-in x with actual values of variables in y that have the same name as the
-ones in x. If FALSE, NA values won't be updated, even if update_values
is
-TRUE
character: Name of reporting variable. Default is ".joyn". -This is the same as variable "_merge" in Stata after performing a merge. If -FALSE or NULL, the reporting variable will be excluded from the final -table, though a summary of the join will be displayed after concluding.
character: One of "factor", "character" or "numeric". Default is -"factor". If "numeric", the reporting variable will contain numeric -codes of the source and the contents of each observation in the joined -table. See below for more information.
double: to be implemented
logical: If TRUE, it will keep the original variable -from y when both tables have common variable names. Thus, the prefix "y." -will be added to the original name to distinguish from the resulting -variable in the joined table.
logical: If TRUE, sort by key variables in by
. Default is
-FALSE.
logical: if FALSE, it won't display any message (programmer's -option). Default is TRUE.
Arguments passed on to joyn
match_type
character: one of "m:m", "m:1", "1:m", "1:1". -Default is "1:1" since this is the most restrictive. However, following -Stata's recommendation, it is better to be explicit and use any of the -other three match types (See details in match types sections).
allow.cartesian
logical: Check documentation in official web site.
-Default is NULL
, which implies that if the join is "1:1" it will be
-FALSE
, but if the join has any "m" on it, it will be converted to TRUE
.
-By specifying TRUE
or FALSE
you force the behavior of the join.
suffixes
A character(2) specifying the suffixes to be used for making -non-by column names unique. The suffix behaviour works in a similar fashion -as the base::merge method does.
yvars
keep_y_in_x
msg_type
character: type of messages to display by default
na.last
logical
. If TRUE
, missing values in the data are placed last; if FALSE
, they are placed first; if NA
they are removed.
-na.last=NA
is valid only for x[order(., na.last)]
and its
-default is TRUE
. setorder
and setorderv
only accept
-TRUE
/FALSE
with default FALSE
.
A data frame of the same class as x
. The properties of the output
-are as close as possible to the ones returned by the dplyr alternative.
Other dplyr alternatives:
-anti_join()
,
-full_join()
,
-inner_join()
,
-left_join()
# Simple right join
-library(data.table)
-
-x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
- t = c(1L, 2L, 1L, 2L, NA_integer_),
- x = 11:15)
-y1 = data.table(id = c(1,2, 4),
- y = c(11L, 15L, 16))
-right_join(x1, y1, relationship = "many-to-one")
-#>
-#> ── JOYn Report ──
-#>
-#> .joyn n percent
-#> 1 x 2 50%
-#> 2 y 1 25%
-#> 3 x & y 1 25%
-#> 4 total 4 100%
-#> ────────────────────────────────────────────────────────── End of JOYn report ──
-#> ℹ Note: Joyn's report available in variable .joyn
-#> ℹ Note: Removing key variables id from id and y
-#> id t x y .joyn
-#> <num> <int> <int> <num> <fctr>
-#> 1: 1 1 11 11 x & y
-#> 2: 1 2 12 11 x & y
-#> 3: 2 1 13 15 x & y
-#> 4: 4 NA NA 16 y
-
R/dplyr-joins.R
- set_col_names.Rd
Add x key var and y key var (with suffixes) to x and y --when joining by different variables and keep is true
-This function is used to change the value of one or more joyn options
-pairs of option = value
environment, which is joyn environment by default
JOYn options functions
-get_joyn_options()
Split matching type (one of "1:1", "m:1", "1:m", "m:m"
) into its two components
Store joyn message to .joynenv environment
-combination of type and text in the form style1 = text1, style2 = text2
, etc.
Messages functions
-clear_joynenv()
,
-joyn_msg()
,
-joyn_msgs_exist()
,
-joyn_report()
,
-msg_type_dt()
,
-style()
,
-type_choices()
This is an adaptation from -https://github.com/r-lib/pkgbuild/blob/3ba537ab8a6ac07d3fe11c17543677d2a0786be6/R/styles.R
-combination of type and text in the form
-type1 = text1, type2 = text2
a character string to separate the terms to paste
Messages functions
-clear_joynenv()
,
-joyn_msg()
,
-joyn_msgs_exist()
,
-joyn_report()
,
-msg_type_dt()
,
-store_msg()
,
-type_choices()
Choice of messages
-Messages functions
-clear_joynenv()
,
-joyn_msg()
,
-joyn_msgs_exist()
,
-joyn_report()
,
-msg_type_dt()
,
-store_msg()
,
-style()
The function updates NAs and/or values in the following way:
If only update_NAs is TRUE: update NAs of var in x with values of var y of the same name
If only update_values = TRUE: update all values, but NOT NAs, of var in x with values of var y of the same name. -NAs from y are not used to update values in x . (e.g., if x.var = 10 and y.var = NA, x.var remains 10)
If both update_NAs and update_values are TRUE, both NAs and values in x are updated as described above
If both update_NAs and update_values are FALSE, no update
joined data.table
variable(s) to be updated
character: Name of reporting variable. Default is ".joyn". -This is the same as variable "_merge" in Stata after performing a merge. If -FALSE or NULL, the reporting variable will be excluded from the final -table, though a summary of the join will be displayed after concluding.
A character(2) specifying the suffixes to be used for making -non-by column names unique. The suffix behaviour works in a similar fashion -as the base::merge method does.
inherited from joyn update_NAs
inherited from joyn update_values