Skip to content
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
34a744e
add fix
ben-schwen Apr 6, 2022
80b0b91
add test
ben-schwen Apr 6, 2022
1f6c41e
add more tests
ben-schwen Apr 6, 2022
4ab82c1
add NEWS
ben-schwen Apr 6, 2022
5a3e3c7
typo
MichaelChirico Apr 7, 2022
8d70e6d
sentence structure
MichaelChirico Apr 7, 2022
b1f4892
state bug more precisely
ben-schwen Apr 7, 2022
2183cbb
unwield lengthy if with ws
ben-schwen Apr 7, 2022
f42d498
more tests
ben-schwen Apr 7, 2022
96437fb
extend NEWS for mirror casse
ben-schwen Apr 7, 2022
f45564a
update NEWS
ben-schwen Jun 26, 2022
9ce6737
vapply on single columns instead of whole subset
ben-schwen Sep 9, 2024
b7bbb5a
Merge branch 'master' into merge_factor_char_key
ben-schwen Sep 9, 2024
608599c
use .shallow
ben-schwen Sep 9, 2024
fedde2d
Merge branch 'master' into merge_factor_char_key
MichaelChirico Jun 30, 2025
0d5f5a5
suggested NEWS wording
MichaelChirico Jun 30, 2025
c204850
add OPs original example more exactly to the regression test
MichaelChirico Jun 30, 2025
6750e27
add some tests of multiple join columns
MichaelChirico Jun 30, 2025
49a8f1a
avoid using column x in table x
MichaelChirico Jun 30, 2025
c9a33af
attempt to refactor into huge helper (🤞)
MichaelChirico Jun 30, 2025
07e3acc
trailing ws
MichaelChirico Jun 30, 2025
fa3be64
fix (?) tests
MichaelChirico Jun 30, 2025
dc95644
need to pass 'ans' too
MichaelChirico Jun 30, 2025
5ad17ad
don't reuse overloaded name 'let'
MichaelChirico Jun 30, 2025
7c8d2f7
typo
MichaelChirico Jun 30, 2025
088f876
remove apparently vestigial check
MichaelChirico Jun 30, 2025
dda8722
Merge remote-tracking branch 'origin/master' into merge_factor_char_key
MichaelChirico Jun 30, 2025
a1cbe53
Merge branch 'master' into merge_factor_char_key
ben-schwen Jul 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@

14. `data.table()` function is now more aligned with `data.frame()` with respect to the names of the output when one of its inputs is a single-column matrix object, [#4124](https://github.com/Rdatatable/data.table/issues/4124). Thanks @PavoDive for the report and @jangorecki for the PR.

15. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)` and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix.

### NOTES

1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/.
Expand Down
63 changes: 48 additions & 15 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1335,21 +1335,8 @@
ans[icolsAns] = .Call(CsubsetDT, i, ii, icols)
ans[xcolsAns] = .Call(CsubsetDT, x, irows, xcols)
setattr(ans, "names", ansvars)
if (haskey(x)) {
keylen = which.first(!key(x) %chin% ansvars)-1L
if (is.na(keylen)) keylen = length(key(x))
len = length(rightcols)
# fix for #1268, #1704, #1766 and #1823
chk = if (len && !missing(on)) !identical(head(key(x), len), names(on)) else FALSE
if ( (keylen>len || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) {
keylen = if (!chk) len else 0L # fix for #1268
}
## check key on i as well!
ichk = is.data.table(i) && haskey(i) &&
identical(head(key(i), length(leftcols)), names_i[leftcols]) # i has the correct key, #3061
if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE
setattr(ans,"sorted",head(key(x),keylen))
}
# NB: could be NULL
setattr(ans, "sorted", .join_result_key(x, i, ans, if (!missing(on)) names(on), ansvars, leftcols, rightcols, names_i, irows, roll))
setattr(ans, "class", class(x)) # retain class that inherits from data.table, #64
setattr(ans, "row.names", .set_row_names(length(ans[[1L]])))
setalloccol(ans)
Expand Down Expand Up @@ -2021,6 +2008,52 @@
setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line
}

# can the specified merge of x and i be marked as sorted? return the columns for which this is true, otherwise NULL
.join_result_key <- function(x, i, ans, on_lhs, ansvars, leftcols, rightcols, names_i, irows, roll) {
x_key <- key(x)
if (is.null(x_key))
return(NULL)

key_length = which.first(!x_key %chin% ansvars) - 1L
if (is.na(key_length))
key_length = length(x_key)

rhs_length = length(rightcols)
# fix for #1268, #1704, #1766 and #1823
chk = rhs_length && !is.null(on_lhs) && !identical(head(x_key, rhs_length), on_lhs)
if ( (key_length > rhs_length || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) {
key_length = if (chk) 0L else rhs_length # fix for #1268
}

if (!key_length)
return(NULL)

## check key on i as well!
if (is.logical(i))
Copy link
Member

@MichaelChirico MichaelChirico Jun 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is.logical(i) check has been there since initial check-in in 2008 (2ec50ec; was is.logical(irows) then), not sure it's possible to reach it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For queries like DT[<logical subset>], we return here:

data.table/R/data.table.R

Lines 681 to 684 in 6029f2f

if (!length(leftcols)) {
# basic x[i] subset, #2951
if (is.null(irows)) return(shallow(x)) # e.g. DT[TRUE] (#3214); otherwise CsubsetDT would materialize a deep copy
else return(.Call(CsubsetDT, x, irows, seq_along(x)) )

For queries like DT[<logical subset>, .(key, other)], the key is retained & the value returned here:

data.table/R/data.table.R

Lines 1459 to 1467 in 6029f2f

if (is.null(irows) && !is.null(shared_keys)) {
setattr(jval, 'sorted', shared_keys)
# potentially inefficient backup -- check if jval is sorted by key(x)
} else if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) {
setattr(jval, 'sorted', key(x))
}
if (any(vapply_1b(jval, is.null))) internal_error("j has created a data.table result containing a NULL column") # nocov
}
return(jval)

So I'm pretty sure it's not possible to reach this. We can see if revdeps turn anything up.

return(head(x_key, key_length))

Check warning on line 2033 in R/data.table.R

View check run for this annotation

Codecov / codecov/patch

R/data.table.R#L2033

Added line #L2033 was not covered by tests

# i has the correct key, #3061
if (identical(head(key(i), length(leftcols)), names_i[leftcols]))
return(head(x_key, key_length))

if (!.Call(CisOrderedSubset, irows, nrow(x)))
return(NULL)

# see #1010. don't set key when i has no key, but irows is ordered and !roll
if (roll && length(irows) != 1L)
return(NULL)

new_key <- head(x_key, key_length)

#5361 merging on keyed factor with character, check if resulting character is really sorted
if (identical(vapply_1c(.shallow(i, leftcols), typeof), vapply_1c(.shallow(x, rightcols), typeof)))
return(new_key)

if (!is.sorted(ans, by=new_key))
return(NULL)
new_key
}

# What's the name of the top-level call in 'j'?
# NB: earlier, we used 'as.character()' but that fails for closures/builtins (#6026).
root_name = function(jsub) if (is.call(jsub)) paste(deparse(jsub[[1L]]), collapse = " ") else ""
Expand Down
44 changes: 44 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -7077,6 +7077,50 @@ test(1483.3, merge(x,y,by="country",all=TRUE), data.table(country=factor(c("US",
setkey(y)
test(1483.4, y[x], data.table(country="US", key="country"))

# 5361 merge on character and factor should only have key(x) if result is really sorted
lett = c("a", "b", "c")
rlet = c("c", "b", "a")
x = data.table(i=rlet)
y = data.table(i=factor(lett, levels=rlet), key="i")
test(1483.51, x[y, on="i"], x)
test(1483.52, y[x, on="i"], x)
test(1483.53, merge(x, y, by="i"), data.table(i=lett, key="i"))
test(1483.54, merge(y, x, by="i"), data.table(i=lett, key="i"))
x = data.table(i1=1:3, i2=rlet)
y = data.table(i1=1:3, i2=factor(lett, levels=rlet), key=c("i1", "i2"))
test(1483.55, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett))
test(1483.56, y[x, on=c("i1", "i2")], x)
test(1483.57, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
test(1483.58, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))

x = data.table(i=rlet, key="i")
y = data.table(i=factor(lett, levels=rlet))
test(1483.61, x[y, on="i"], x)
test(1483.62, y[x, on="i"], data.table(i=lett))
test(1483.63, merge(x, y, by="i"), data.table(i=lett, key="i"))
test(1483.64, merge(y, x, by="i"), data.table(i=lett, key="i"))
x = data.table(i1=1:3, i2=rlet, key=c("i1", "i2"))
y = data.table(i1=1:3, i2=factor(lett, levels=rlet))
test(1483.65, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett))
test(1483.66, y[x, on=c("i1", "i2")], data.table(i1=1:3, i2=rlet))
test(1483.67, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
test(1483.68, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))

x = data.table(i=rlet, a=rlet)
y = data.table(i=factor(lett, levels=rlet), b=lett, key="i")
test(1483.71, x[y, on="i"], data.table(i=rlet, a=rlet, b=rlet))
test(1483.72, y[x, on="i"], data.table(i=rlet, b=rlet, a=rlet))
test(1483.73, merge(x, y, by="i"), data.table(i=lett, a=lett, b=lett, key="i"))
test(1483.74, merge(y, x, by="i"), data.table(i=lett, b=lett, a=lett, key="i"))

some_letters <- c("c", "b", "a")
some_more_letters <- rep(c("a", "b", "c"), 2L)
dt1 <- data.table(x = some_letters, y=1:3)
dt2 <- data.table(x = factor(some_more_letters, levels=some_letters), z=1:6, key=c("x", "z"))
dt3 <- merge(dt1, dt2, by="x")
test(1483.81, key(dt3), "x")
test(1483.82, nrow(dt3[x %in% "c", ]), 2L)

# NULL items should be removed when making data.table from list, #842
# Original fix for #842 added a branch in as.data.table.list() using point()
# Then PR#3471 moved logic from data.table() into as.data.table.list() and now removes NULL items up front, so longer need for the branch
Expand Down
Loading