Skip to content

Commit 40e3a8b

Browse files
authored
Prefer FileSystemProvider plugins for http/ftp over legacy handlers (#1693)
* Reordering priority in SeekableStreamFactory so FileSystemProvider plugins will be preferred over the built-in http/ftp handlers. * Now when constructing streams it will prefer NIO plugins if they are available. * If no http(s) / ftp plugin exists it will fall back to the htsjdk built-in handlers. * Similarly updating ParsingUtils.openInputStream() and exists() * Update HtsPath to throw on non-file schemes with malformed URIs instead of trying to interpret them as file:// * Deprecating SeekableStreamFactory.isFilePath() since it is no longer used and interacts poorly with NIO filesystem providers * Note: previously, unencoded FTP paths were allowed. Now FTP paths with spaces must be percent-encoded.
1 parent 3964abe commit 40e3a8b

11 files changed

+214
-97
lines changed

src/main/java/htsjdk/io/HtsPath.java

+24
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,10 @@ private URI getURIForString(final String pathString) {
253253
tempURI = getCachedPath().toUri();
254254
}
255255
} catch (URISyntaxException uriException) {
256+
//check that the uri wasn't a badly encoded absolute uri of some sort
257+
//if you don't do this it will be treated as a badly formed file:// url
258+
assertNoNonFileScheme(pathString, uriException);
259+
256260
// the input string isn't a valid URI; assume it's a local (non-URI) file reference, and
257261
// use the URI resulting from the corresponding Path
258262
try {
@@ -276,5 +280,25 @@ private URI getURIForString(final String pathString) {
276280

277281
return tempURI;
278282
}
283+
284+
/**
285+
* check that there isn't a non file scheme at the start of the path
286+
* @param pathString
287+
* @param cause
288+
*/
289+
private static void assertNoNonFileScheme(String pathString, URISyntaxException cause){
290+
final String[] split = pathString.split(":");
291+
if(split.length > 1){
292+
if(split[0] == null || split[0].isEmpty()){
293+
throw new IllegalArgumentException("Malformed url " + pathString + " includes an empty scheme." +
294+
"\nCheck that it is fully encoded.", cause);
295+
}
296+
if(!split[0].equals("file")){
297+
throw new IllegalArgumentException("Malformed url " + pathString + " includes a scheme: " + split[0] + ":// but was an invalid URI." +
298+
"\nCheck that it is fully encoded.", cause);
299+
}
300+
}
301+
302+
}
279303

280304
}

src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java

+1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ public class SAMRecordSetBuilder implements Iterable<SAMRecord> {
5454
"chr21", "chr22", "chrX", "chrY", "chrM"
5555
};
5656

57+
5758
private static final String READ_GROUP_ID = "1";
5859
private static final String SAMPLE = "FREE_SAMPLE";
5960
private final Random random = new Random(TestUtil.RANDOM_SEED);

src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java

+58-24
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,15 @@
2323
*/
2424
package htsjdk.samtools.seekablestream;
2525

26-
import htsjdk.samtools.util.IOUtil;
27-
import java.io.File;
26+
import htsjdk.io.HtsPath;
27+
import htsjdk.io.IOPath;
28+
import htsjdk.tribble.TribbleException;
29+
2830
import java.io.IOException;
29-
import java.net.URI;
3031
import java.net.URL;
3132
import java.nio.channels.SeekableByteChannel;
33+
import java.nio.file.Path;
34+
import java.util.Set;
3235
import java.util.function.Function;
3336

3437
/**
@@ -40,6 +43,14 @@
4043
public class SeekableStreamFactory{
4144

4245
private static final ISeekableStreamFactory DEFAULT_FACTORY;
46+
private static final String HTTP = "http";
47+
private static final String HTTPS = "https";
48+
private static final String FTP = "ftp";
49+
/**
50+
* the set of url schemes that have special support in htsjdk that isn't through a FileSystemProvider
51+
*/
52+
private static final Set<String> URL_SCHEMES_WITH_LEGACY_SUPPORT = Set.of(HTTP, FTP, HTTPS);
53+
public static final String FILE_SCHEME = "file";
4354
private static ISeekableStreamFactory currentFactory;
4455

4556
static{
@@ -61,9 +72,28 @@ public static ISeekableStreamFactory getInstance(){
6172
* Does this path point to a regular file on disk and not something like a URL?
6273
* @param path the path to test
6374
* @return true if the path is to a file on disk
75+
* @deprecated this method is simplistic and no longer particularly useful since IOPath allows similar access to
76+
* various non-file data sources, internal use has been replaced with {@link #isBeingHandledByLegacyUrlSupport(String)}
6477
*/
78+
@Deprecated
6579
public static boolean isFilePath(final String path) {
66-
return ! ( path.startsWith("http:") || path.startsWith("https:") || path.startsWith("ftp:") );
80+
return !canBeHandledByLegacyUrlSupport(path);
81+
}
82+
83+
/**
84+
* is this path being handled by one of the legacy SeekableStream types (http(s) / ftp)
85+
*
86+
* @param path a path to check
87+
* @return if the path is not being handled by a FileSystemProvider and it can be read by legacy streams
88+
*/
89+
public static boolean isBeingHandledByLegacyUrlSupport(final String path){
90+
return !new HtsPath(path).hasFileSystemProvider() //if we have a provider for it that's what we'll use
91+
&& canBeHandledByLegacyUrlSupport(path); // otherwise we fall back to the special handlers
92+
}
93+
94+
//is this one of the url types that has legacy htsjdk support built in?
95+
public static boolean canBeHandledByLegacyUrlSupport(final String path) {
96+
return URL_SCHEMES_WITH_LEGACY_SUPPORT.stream().anyMatch(scheme-> path.startsWith(scheme +"://"));
6797
}
6898

6999
private static class DefaultSeekableStreamFactory implements ISeekableStreamFactory {
@@ -79,7 +109,7 @@ public SeekableStream getStreamFor(final String path) throws IOException {
79109
}
80110

81111
/**
82-
* The wrapper will only be applied to the stream if the stream is treated as a {@link java.nio.file.Path}
112+
* The wrapper will only be applied to the stream if the stream is treated as a {@link Path}
83113
*
84114
* This currently means any uri with a scheme that is not http, https, ftp, or file will have the wrapper applied to it
85115
*
@@ -89,26 +119,30 @@ public SeekableStream getStreamFor(final String path) throws IOException {
89119
@Override
90120
public SeekableStream getStreamFor(final String path,
91121
Function<SeekableByteChannel, SeekableByteChannel> wrapper) throws IOException {
92-
// todo -- add support for SeekableBlockInputStream
93-
94-
if (path.startsWith("http:") || path.startsWith("https:")) {
95-
final URL url = new URL(path);
96-
return new SeekableHTTPStream(url);
97-
} else if (path.startsWith("ftp:")) {
98-
return new SeekableFTPStream(new URL(path));
99-
} else if (path.startsWith("file:")) {
100-
try {
101-
// convert to URI in order to obtain a decoded version of the path string suitable
102-
// for use with the File constructor
103-
final String decodedPath = new URI(path).getPath();
104-
return new SeekableFileStream(new File(decodedPath));
105-
} catch (java.net.URISyntaxException e) {
106-
throw new IllegalArgumentException(String.format("The input string %s contains a URI scheme but is not a valid URI", path), e);
107-
}
108-
} else if (IOUtil.hasScheme(path)) {
109-
return new SeekablePathStream(IOUtil.getPath(path), wrapper);
122+
return getStreamFor(new HtsPath(path), wrapper);
123+
}
124+
125+
126+
/**
127+
* The wrapper will only be applied to the stream if the stream is treated as a non file:// {@link Path}
128+
*
129+
* This has a fallback to htsjdk's built-in http and ftp providers if no FileSystemProvider is available for them
130+
*
131+
* @param path an IOPath to be opened
132+
* @param wrapper a wrapper to apply to the stream allowing direct transformations on the byte stream to be applied
133+
* @throws IOException
134+
*/
135+
public static SeekableStream getStreamFor(final IOPath path, Function<SeekableByteChannel, SeekableByteChannel> wrapper) throws IOException {
136+
if(path.hasFileSystemProvider()) {
137+
return path.getScheme().equals(FILE_SCHEME)
138+
? new SeekableFileStream(path.toPath().toFile()) //don't apply the wrapper to local files
139+
: new SeekablePathStream(path.toPath(), wrapper);
110140
} else {
111-
return new SeekableFileStream(new File(path));
141+
return switch(path.getScheme()){
142+
case HTTP, HTTPS -> new SeekableHTTPStream(new URL(path.getRawInputString()));
143+
case FTP -> new SeekableFTPStream((new URL(path.getRawInputString())));
144+
default -> throw new TribbleException("Unknown path type. No FileSystemProvider available for " + path.getRawInputString());
145+
};
112146
}
113147
}
114148

src/main/java/htsjdk/tribble/FeatureCodec.java

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
package htsjdk.tribble;
2020

21+
import htsjdk.io.IOPath;
2122
import htsjdk.samtools.util.LocationAware;
2223
import htsjdk.tribble.index.tabix.TabixFormat;
2324

src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java

+7-6
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
*/
2424
package htsjdk.tribble;
2525

26+
import htsjdk.io.HtsPath;
2627
import htsjdk.samtools.seekablestream.SeekableStream;
2728
import htsjdk.samtools.seekablestream.SeekableStreamFactory;
2829
import htsjdk.samtools.util.IOUtil;
@@ -40,6 +41,7 @@
4041
import java.net.URI;
4142
import java.net.URLEncoder;
4243
import java.nio.channels.SeekableByteChannel;
44+
import java.nio.charset.StandardCharsets;
4345
import java.util.ArrayList;
4446
import java.util.Iterator;
4547
import java.util.List;
@@ -60,9 +62,9 @@ public class TribbleIndexedFeatureReader<T extends Feature, SOURCE> extends Abst
6062
private Index index;
6163

6264
/**
63-
* is the path pointing to our source data a regular file?
65+
* is the path backed by old style built in http(s) / ftp support instead of a FileSystemProvider
6466
*/
65-
private final boolean pathIsRegularFile;
67+
private final boolean pathIsOldStyleHttpOrFtp;
6668

6769
/**
6870
* a potentially reusable seekable stream for queries over regular files
@@ -97,8 +99,7 @@ public TribbleIndexedFeatureReader(final String featurePath, final FeatureCodec<
9799
}
98100
}
99101

100-
// does path point to a regular file?
101-
this.pathIsRegularFile = SeekableStreamFactory.isFilePath(path);
102+
this.pathIsOldStyleHttpOrFtp = SeekableStreamFactory.isBeingHandledByLegacyUrlSupport(path);
102103

103104
readHeader();
104105
}
@@ -203,7 +204,7 @@ private SeekableStream getSeekableStream() throws IOException {
203204
* @return true if the path is not backed by the old style built-in http(s) / ftp support
204205
*/
205206
private boolean reuseStreamInQuery() {
206-
return pathIsRegularFile;
207+
return !pathIsOldStyleHttpOrFtp;
207208
}
208209

209210
@Override
@@ -252,7 +253,7 @@ private void readHeader() throws IOException {
252253
PositionalBufferedStream pbs = null;
253254
try {
254255
is = ParsingUtils.openInputStream(path, wrapper);
255-
if (IOUtil.hasBlockCompressedExtension(new URI(URLEncoder.encode(path, "UTF-8")))) {
256+
if (IOUtil.hasBlockCompressedExtension(new HtsPath(path).getURI())) {
256257
// TODO: TEST/FIX THIS! https://github.com/samtools/htsjdk/issues/944
257258
// TODO -- warning I don't think this can work, the buffered input stream screws up position
258259
is = new GZIPInputStream(new BufferedInputStream(is));

src/main/java/htsjdk/tribble/util/FTPHelper.java

+7-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import java.io.IOException;
88
import java.io.InputStream;
9+
import java.net.URISyntaxException;
910
import java.net.URL;
1011

1112
/**
@@ -35,7 +36,12 @@ public long getContentLength() throws IOException {
3536

3637
@Override
3738
public InputStream openInputStream() throws IOException {
38-
String file = url.getPath();
39+
String file = null;
40+
try {
41+
file = url.toURI().getPath();
42+
} catch (URISyntaxException e) {
43+
throw new IOException(e);
44+
}
3945
FTPClient ftp = FTPUtils.connect(url.getHost(), url.getUserInfo(), null);
4046
ftp.pasv();
4147
ftp.retr(file);

src/main/java/htsjdk/tribble/util/ParsingUtils.java

+25-20
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,20 @@
2323
*/
2424
package htsjdk.tribble.util;
2525

26+
import htsjdk.io.HtsPath;
27+
import htsjdk.io.IOPath;
2628
import htsjdk.samtools.seekablestream.SeekablePathStream;
29+
import htsjdk.samtools.seekablestream.SeekableStreamFactory;
2730
import htsjdk.samtools.util.IOUtil;
31+
2832
import java.awt.Color;
2933
import java.io.File;
30-
import java.io.FileInputStream;
3134
import java.io.IOException;
3235
import java.io.InputStream;
33-
import java.lang.reflect.Constructor;
3436
import java.net.MalformedURLException;
3537
import java.net.URI;
3638
import java.net.URL;
39+
import java.net.URLEncoder;
3740
import java.nio.channels.SeekableByteChannel;
3841
import java.nio.file.Files;
3942
import java.util.*;
@@ -49,7 +52,7 @@ public class ParsingUtils {
4952
private static URLHelperFactory urlHelperFactory = RemoteURLHelper::new;
5053

5154
// HTML 4.1 color table, + orange and magenta
52-
private static Map<String, String> colorSymbols = new HashMap();
55+
private static final Map<String, String> colorSymbols = new HashMap<>();
5356

5457
static {
5558
colorSymbols.put("white", "FFFFFF");
@@ -81,32 +84,35 @@ public static InputStream openInputStream(String path)
8184
return openInputStream(path, null);
8285
}
8386

84-
static private final Set<String> URL_SCHEMES = new HashSet<>(Arrays.asList("http", "ftp", "https"));
85-
8687
/**
8788
* open an input stream from the given path and wrap the raw byte stream with a wrapper if given
8889
*
89-
* the wrapper will only be applied to paths that are not http, https, ftp, or file, i.e. any {@link java.nio.file.Path}
90-
* using a custom filesystem plugin
90+
* the wrapper will only be applied to paths that are
91+
* 1. not local files
92+
* 2. not being handled by the legacy http(s)/ftp providers
93+
* i.e. any {@link java.nio.file.Path} using a custom FileSystem plugin
9194
* @param uri a uri like string
9295
* @param wrapper to wrap the input stream in, may be used to implement caching or prefetching, etc
9396
* @return An inputStream appropriately created from uri and conditionally wrapped with wrapper (only in certain cases)
9497
* @throws IOException when stream cannot be opened against uri
9598
*/
9699
public static InputStream openInputStream(final String uri, final Function<SeekableByteChannel, SeekableByteChannel> wrapper)
97100
throws IOException {
98-
99-
final InputStream inputStream;
100-
101-
if (URL_SCHEMES.stream().anyMatch(uri::startsWith)) {
102-
inputStream = getURLHelper(new URL(uri)).openInputStream();
103-
} else if (!IOUtil.hasScheme(uri)) {
104-
File file = new File(uri);
105-
inputStream = Files.newInputStream(file.toPath());
101+
final IOPath path = new HtsPath(uri);
102+
if(path.hasFileSystemProvider()){
103+
if(path.isPath()) {
104+
return path.getScheme().equals("file")
105+
? Files.newInputStream(path.toPath())
106+
: new SeekablePathStream(path.toPath(), wrapper);
107+
} else {
108+
throw new IOException("FileSystemProvider for path " + path.getRawInputString() + " exits but failed to " +
109+
" create path. \n" + path.getToPathFailureReason());
110+
}
111+
} else if( SeekableStreamFactory.canBeHandledByLegacyUrlSupport(uri)){
112+
return getURLHelper(new URL(uri)).openInputStream();
106113
} else {
107-
inputStream = new SeekablePathStream(IOUtil.getPath(uri), wrapper);
114+
throw new IOException("No FileSystemProvider available to handle path: " + path.getRawInputString());
108115
}
109-
return inputStream;
110116
}
111117

112118
public static <T> String join(String separator, Collection<T> objects) {
@@ -402,10 +408,9 @@ private static Color hexToColor(String string) {
402408
}
403409

404410
public static boolean resourceExists(String resource) throws IOException{
405-
406-
boolean remoteFile = resource.startsWith("http://") || resource.startsWith("https://") || resource.startsWith("ftp://");
411+
boolean remoteFile = SeekableStreamFactory.isBeingHandledByLegacyUrlSupport(resource);
407412
if (remoteFile) {
408-
URL url = null;
413+
URL url;
409414
try {
410415
url = new URL(resource);
411416
} catch (MalformedURLException e) {

src/test/java/htsjdk/io/HtsPathUnitTest.java

+3
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ public Object[][] validHtsPath() {
9292
{"gcs://abucket/bucket", "gcs://abucket/bucket", false, false},
9393
{"gendb://somegdb", "gendb://somegdb", false, false},
9494
{"chr1:1-100", "chr1:1-100", false, false},
95+
{"ftp://broad.org/file", "ftp://broad.org/file", false, false},
96+
{"ftp://broad.org/with%20space", "ftp://broad.org/with%20space", false, false},
9597

9698
//**********************************************************************************************
9799
// Valid URIs which ARE valid NIO URIs (there *IS* an installed file system provider), but are
@@ -167,6 +169,7 @@ public Object[][] invalidHtsPath() {
167169
// the nul character is rejected on all of the supported platforms in both local
168170
// filenames and URIs, so use it to test HtsPath constructor failure on all platforms
169171
{"\0"},
172+
{"ftp://broad.org/file with space"} // this has a non-file scheme but isn't encoded properly
170173
};
171174
}
172175

0 commit comments

Comments
 (0)