Rdatatable · ben-schwen · Jul 2, 2025 · Apr 6, 2022 · Apr 6, 2022 · Apr 6, 2022
@@ -70,6 +70,8 @@
 
 14. `data.table()` function is now more aligned with `data.frame()` with respect to the names of the output when one of its inputs is a single-column matrix object, [#4124](https://github.com/Rdatatable/data.table/issues/4124). Thanks @PavoDive for the report and @jangorecki for the PR.
 
+15. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)` and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix.
+
 ### NOTES
 
 1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/.

@@ -1335,21 +1335,8 @@
         ans[icolsAns] = .Call(CsubsetDT, i, ii,    icols)
         ans[xcolsAns] = .Call(CsubsetDT, x, irows, xcols)
         setattr(ans, "names", ansvars)
-        if (haskey(x)) {
-          keylen = which.first(!key(x) %chin% ansvars)-1L
-          if (is.na(keylen)) keylen = length(key(x))
-          len = length(rightcols)
-          # fix for #1268, #1704, #1766 and #1823
-          chk = if (len && !missing(on)) !identical(head(key(x), len), names(on)) else FALSE
-          if ( (keylen>len || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) {
-            keylen = if (!chk) len else 0L # fix for #1268
-          }
-          ## check key on i as well!
-          ichk = is.data.table(i) && haskey(i) &&
-                 identical(head(key(i), length(leftcols)), names_i[leftcols]) # i has the correct key, #3061
-          if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE
-            setattr(ans,"sorted",head(key(x),keylen))
-        }
+        # NB: could be NULL
+        setattr(ans, "sorted", .join_result_key(x, i, ans, if (!missing(on)) names(on), ansvars, leftcols, rightcols, names_i, irows, roll))
         setattr(ans, "class", class(x))  # retain class that inherits from data.table, #64
         setattr(ans, "row.names", .set_row_names(length(ans[[1L]])))
         setalloccol(ans)
@@ -2021,6 +2008,52 @@
   setalloccol(ans)   # TODO: overallocate in dogroups in the first place and remove this line
 }
 
+# can the specified merge of x and i be marked as sorted? return the columns for which this is true, otherwise NULL
+.join_result_key <- function(x, i, ans, on_lhs, ansvars, leftcols, rightcols, names_i, irows, roll) {
+  x_key <- key(x)
+  if (is.null(x_key))
+    return(NULL)
+
+  key_length = which.first(!x_key %chin% ansvars) - 1L
+  if (is.na(key_length))
+    key_length = length(x_key)
+
+  rhs_length = length(rightcols)
+  # fix for #1268, #1704, #1766 and #1823
+  chk = rhs_length && !is.null(on_lhs) && !identical(head(x_key, rhs_length), on_lhs)
+  if ( (key_length > rhs_length || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) {
+    key_length = if (chk) 0L else rhs_length # fix for #1268
+  }
+
+  if (!key_length)
+    return(NULL)
+
+  ## check key on i as well!
+  if (is.logical(i))
 if (!length(leftcols)) { 
   # basic x[i] subset, #2951 
   if (is.null(irows)) return(shallow(x))   # e.g. DT[TRUE] (#3214); otherwise CsubsetDT would materialize a deep copy 
   else                return(.Call(CsubsetDT, x, irows, seq_along(x)) ) 
   if (is.null(irows) && !is.null(shared_keys)) { 
     setattr(jval, 'sorted', shared_keys) 
     # potentially inefficient backup -- check if jval is sorted by key(x) 
   } else if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) { 
     setattr(jval, 'sorted', key(x)) 
   } 
   if (any(vapply_1b(jval, is.null))) internal_error("j has created a data.table result containing a NULL column") # nocov 
 } 
 return(jval) 
 if (!length(leftcols)) { 
   # basic x[i] subset, #2951 
   if (is.null(irows)) return(shallow(x))   # e.g. DT[TRUE] (#3214); otherwise CsubsetDT would materialize a deep copy 
   else                return(.Call(CsubsetDT, x, irows, seq_along(x)) ) 
   if (is.null(irows) && !is.null(shared_keys)) { 
     setattr(jval, 'sorted', shared_keys) 
     # potentially inefficient backup -- check if jval is sorted by key(x) 
   } else if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) { 
     setattr(jval, 'sorted', key(x)) 
   } 
   if (any(vapply_1b(jval, is.null))) internal_error("j has created a data.table result containing a NULL column") # nocov 
 } 
 return(jval) 
+    return(head(x_key, key_length))
+
+  # i has the correct key, #3061
+  if (identical(head(key(i), length(leftcols)), names_i[leftcols]))
+    return(head(x_key, key_length))
+
+  if (!.Call(CisOrderedSubset, irows, nrow(x)))
+    return(NULL)
+
+  # see #1010. don't set key when i has no key, but irows is ordered and !roll
+  if (roll && length(irows) != 1L)
+    return(NULL)
+
+  new_key <- head(x_key, key_length)
+
+  #5361 merging on keyed factor with character, check if resulting character is really sorted
+  if (identical(vapply_1c(.shallow(i, leftcols), typeof), vapply_1c(.shallow(x, rightcols), typeof)))
+    return(new_key)
+
+  if (!is.sorted(ans, by=new_key))
+    return(NULL)
+  new_key
+}
+
 # What's the name of the top-level call in 'j'?
 # NB: earlier, we used 'as.character()' but that fails for closures/builtins (#6026).
 root_name = function(jsub) if (is.call(jsub)) paste(deparse(jsub[[1L]]), collapse = " ") else ""

@@ -7077,6 +7077,50 @@ test(1483.3, merge(x,y,by="country",all=TRUE), data.table(country=factor(c("US",
 setkey(y)
 test(1483.4, y[x], data.table(country="US", key="country"))
 
+# 5361 merge on character and factor should only have key(x) if result is really sorted
+lett = c("a", "b", "c")
+rlet = c("c", "b", "a")
+x = data.table(i=rlet)
+y = data.table(i=factor(lett, levels=rlet), key="i")
+test(1483.51, x[y, on="i"], x)
+test(1483.52, y[x, on="i"], x)
+test(1483.53, merge(x, y, by="i"), data.table(i=lett, key="i"))
+test(1483.54, merge(y, x, by="i"), data.table(i=lett, key="i"))
+x = data.table(i1=1:3, i2=rlet)
+y = data.table(i1=1:3, i2=factor(lett, levels=rlet), key=c("i1", "i2"))
+test(1483.55, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett))
+test(1483.56, y[x, on=c("i1", "i2")], x)
+test(1483.57, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
+test(1483.58, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
+
+x = data.table(i=rlet, key="i")
+y = data.table(i=factor(lett, levels=rlet))
+test(1483.61, x[y, on="i"], x)
+test(1483.62, y[x, on="i"], data.table(i=lett))
+test(1483.63, merge(x, y, by="i"), data.table(i=lett, key="i"))
+test(1483.64, merge(y, x, by="i"), data.table(i=lett, key="i"))
+x = data.table(i1=1:3, i2=rlet, key=c("i1", "i2"))
+y = data.table(i1=1:3, i2=factor(lett, levels=rlet))
+test(1483.65, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett))
+test(1483.66, y[x, on=c("i1", "i2")], data.table(i1=1:3, i2=rlet))
+test(1483.67, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
+test(1483.68, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
+
+x = data.table(i=rlet, a=rlet)
+y = data.table(i=factor(lett, levels=rlet), b=lett, key="i")
+test(1483.71, x[y, on="i"], data.table(i=rlet, a=rlet, b=rlet))
+test(1483.72, y[x, on="i"], data.table(i=rlet, b=rlet, a=rlet))
+test(1483.73, merge(x, y, by="i"), data.table(i=lett, a=lett, b=lett, key="i"))
+test(1483.74, merge(y, x, by="i"), data.table(i=lett, b=lett, a=lett, key="i"))
+
+some_letters <- c("c", "b", "a")
+some_more_letters <- rep(c("a", "b", "c"), 2L)
+dt1 <- data.table(x = some_letters, y=1:3)
+dt2 <- data.table(x = factor(some_more_letters, levels=some_letters), z=1:6, key=c("x", "z"))
+dt3 <- merge(dt1, dt2, by="x")
+test(1483.81, key(dt3), "x")
+test(1483.82, nrow(dt3[x %in% "c", ]), 2L)
+
 # NULL items should be removed when making data.table from list, #842
 # Original fix for #842 added a branch in as.data.table.list() using point()
 # Then PR#3471 moved logic from data.table() into as.data.table.list() and now removes NULL items up front, so longer need for the branch