Skip to content

Commit 0a0fbe9

Browse files
committed
doi: add new doi backend
Add a new backend to support mounting datasets published with a digital object identifier (DOI).
1 parent 0b96713 commit 0a0fbe9

13 files changed

+1383
-0
lines changed

backend/all/all.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
_ "github.com/rclone/rclone/backend/combine"
1515
_ "github.com/rclone/rclone/backend/compress"
1616
_ "github.com/rclone/rclone/backend/crypt"
17+
_ "github.com/rclone/rclone/backend/doi"
1718
_ "github.com/rclone/rclone/backend/drive"
1819
_ "github.com/rclone/rclone/backend/dropbox"
1920
_ "github.com/rclone/rclone/backend/fichier"

backend/doi/api/dataversetypes.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Type definitions specific to Dataverse
2+
3+
package api
4+
5+
// DataverseDatasetResponse is returned by the Dataverse dataset API
6+
type DataverseDatasetResponse struct {
7+
Status string `json:"status"`
8+
Data DataverseDataset `json:"data"`
9+
}
10+
11+
// DataverseDataset is the representation of a dataset
12+
type DataverseDataset struct {
13+
LatestVersion DataverseDatasetVersion `json:"latestVersion"`
14+
}
15+
16+
// DataverseDatasetVersion is the representation of a dataset version
17+
type DataverseDatasetVersion struct {
18+
LastUpdateTime string `json:"lastUpdateTime"`
19+
Files []DataverseFile `json:"files"`
20+
}
21+
22+
// DataverseFile is the representation of a file found in a dataset
23+
type DataverseFile struct {
24+
DirectoryLabel string `json:"directoryLabel"`
25+
DataFile DataverseDataFile `json:"dataFile"`
26+
}
27+
28+
// DataverseDataFile represents file metadata details.
//
// Every field maps one-to-one onto the JSON key named in its struct tag.
type DataverseDataFile struct {
	// Identity and description of the file as stored.
	ID          int64  `json:"id"`
	Filename    string `json:"filename"`
	ContentType string `json:"contentType"`
	FileSize    int64  `json:"filesize"`

	// The "original*" keys; they stay at their zero value when the response
	// does not include them.
	OriginalFileFormat string `json:"originalFileFormat"`
	OriginalFileSize   int64  `json:"originalFileSize"`
	OriginalFileName   string `json:"originalFileName"`

	// MD5 is decoded from the "md5" key.
	MD5 string `json:"md5"`
}

backend/doi/api/inveniotypes.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// Type definitions specific to InvenioRDM
2+
3+
package api
4+
5+
// InvenioRecordResponse is the representation of a record stored in InvenioRDM
6+
type InvenioRecordResponse struct {
7+
Links InvenioRecordResponseLinks `json:"links"`
8+
// Metadata InvenioRecordMetadata `json:"metadata"`
9+
}
10+
11+
// InvenioRecordResponseLinks represents a record's links.
type InvenioRecordResponseLinks struct {
	// Self is decoded from the "self" key.
	Self string `json:"self"`
}
15+
16+
// InvenioFilesResponse is the representation of a record's files
17+
type InvenioFilesResponse struct {
18+
Entries []InvenioFilesResponseEntry `json:"entries"`
19+
}
20+
21+
// InvenioFilesResponseEntry is the representation of a file entry
22+
type InvenioFilesResponseEntry struct {
23+
Key string `json:"key"`
24+
Checksum string `json:"checksum"`
25+
Size int64 `json:"size"`
26+
Updated string `json:"updated"`
27+
MimeType string `json:"mimetype"`
28+
Links InvenioFilesResponseEntryLinks `json:"links"`
29+
}
30+
31+
// InvenioFilesResponseEntryLinks represents file links details.
type InvenioFilesResponseEntryLinks struct {
	// Content is decoded from the "content" key.
	Content string `json:"content"`
}

backend/doi/api/types.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Package api has general type definitions for doi
2+
package api
3+
4+
// DoiResolverResponse is returned by the DOI resolver API
5+
//
6+
// Reference: https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation
7+
type DoiResolverResponse struct {
8+
ResponseCode int `json:"responseCode"`
9+
Handle string `json:"handle"`
10+
Values []DoiResolverResponseValue `json:"values"`
11+
}
12+
13+
// DoiResolverResponseValue is a single handle record value
14+
type DoiResolverResponseValue struct {
15+
Index int `json:"index"`
16+
Type string `json:"type"`
17+
Data DoiResolverResponseValueData `json:"data"`
18+
TTL int `json:"ttl"`
19+
Timestamp string `json:"timestamp"`
20+
}
21+
22+
// DoiResolverResponseValueData is the data held in a handle value.
type DoiResolverResponseValueData struct {
	// Format is decoded from the "format" key.
	Format string `json:"format"`
	// Value is decoded from the "value" key. It is left untyped, so the
	// concrete Go type depends on the JSON that was received.
	Value any `json:"value"`
}

backend/doi/dataverse.go

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
// Implementation for Dataverse
2+
3+
package doi
4+
5+
import (
6+
"context"
7+
"fmt"
8+
"net/http"
9+
"net/url"
10+
"path"
11+
"strings"
12+
"time"
13+
14+
"github.com/rclone/rclone/backend/doi/api"
15+
"github.com/rclone/rclone/fs"
16+
"github.com/rclone/rclone/lib/rest"
17+
)
18+
19+
// activateDataverse reports whether resolvedURL is likely a DOI hosted on a
// Dataverse installation.
//
// The URL is considered a match when it carries a non-empty "persistentId"
// query parameter. A nil resolvedURL is reported as not matching rather than
// panicking, so the function is safe to use as a plain predicate.
func activateDataverse(resolvedURL *url.URL) (isActive bool) {
	if resolvedURL == nil {
		return false
	}
	return resolvedURL.Query().Get("persistentId") != ""
}
25+
26+
// Resolve the main API endpoint for a DOI hosted on a Dataverse installation
27+
func resolveDataverseEndpoint(resolvedURL *url.URL) (provider Provider, endpoint *url.URL, err error) {
28+
queryValues := resolvedURL.Query()
29+
persistentID := queryValues.Get("persistentId")
30+
31+
query := url.Values{}
32+
query.Add("persistentId", persistentID)
33+
endpointURL := resolvedURL.ResolveReference(&url.URL{Path: "/api/datasets/:persistentId/", RawQuery: query.Encode()})
34+
35+
return Dataverse, endpointURL, nil
36+
}
37+
38+
// Implements Fs.List() for Dataverse installations
39+
func (f *Fs) listDataverse(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
40+
fileEntries, err := f.listDataverseDoiFiles(ctx)
41+
if err != nil {
42+
return nil, fmt.Errorf("error listing %q: %w", dir, err)
43+
}
44+
45+
fullDir := path.Join(f.root, dir)
46+
if fullDir != "" {
47+
fullDir = fullDir + "/"
48+
}
49+
dirPaths := map[string]bool{}
50+
for _, entry := range fileEntries {
51+
// First, filter out files not in `fullDir`
52+
if !strings.HasPrefix(entry.remote, fullDir) {
53+
continue
54+
}
55+
// Then, find entries in subfolers
56+
remotePath := entry.remote
57+
if fullDir != "" {
58+
remotePath = strings.TrimLeft(strings.TrimPrefix(remotePath, fullDir), "/")
59+
}
60+
parts := strings.SplitN(remotePath, "/", 2)
61+
if len(parts) == 1 {
62+
newEntry := *entry
63+
newEntry.remote = path.Join(dir, remotePath)
64+
entries = append(entries, &newEntry)
65+
} else {
66+
dirPaths[path.Join(dir, parts[0])] = true
67+
}
68+
}
69+
for dirPath := range dirPaths {
70+
entry := fs.NewDir(dirPath, time.Time{})
71+
entries = append(entries, entry)
72+
}
73+
return entries, nil
74+
}
75+
76+
// List the files contained in the DOI
77+
func (f *Fs) listDataverseDoiFiles(ctx context.Context) (entries []*Object, err error) {
78+
// Use the cache if populated
79+
cachedEntries, found := f.cache.GetMaybe("files")
80+
if found {
81+
parsedEntries, ok := cachedEntries.([]Object)
82+
if ok {
83+
for _, entry := range parsedEntries {
84+
newEntry := entry
85+
entries = append(entries, &newEntry)
86+
}
87+
return entries, nil
88+
}
89+
}
90+
91+
filesURL := f.endpoint
92+
var res *http.Response
93+
var result api.DataverseDatasetResponse
94+
opts := rest.Opts{
95+
Method: "GET",
96+
Path: strings.TrimLeft(filesURL.EscapedPath(), "/"),
97+
Parameters: filesURL.Query(),
98+
}
99+
err = f.pacer.Call(func() (bool, error) {
100+
res, err = f.srv.CallJSON(ctx, &opts, nil, &result)
101+
return shouldRetry(ctx, res, err)
102+
})
103+
if err != nil {
104+
return nil, fmt.Errorf("readDir failed: %w", err)
105+
}
106+
modTime, modTimeErr := time.Parse(time.RFC3339, result.Data.LatestVersion.LastUpdateTime)
107+
if modTimeErr != nil {
108+
fs.Logf(f, "error: could not parse last update time %v", modTimeErr)
109+
modTime = timeUnset
110+
}
111+
for _, file := range result.Data.LatestVersion.Files {
112+
contentURLPath := fmt.Sprintf("/api/access/datafile/%d", file.DataFile.ID)
113+
query := url.Values{}
114+
query.Add("format", "original")
115+
contentURL := f.endpoint.ResolveReference(&url.URL{Path: contentURLPath, RawQuery: query.Encode()})
116+
entry := &Object{
117+
fs: f,
118+
remote: path.Join(file.DirectoryLabel, file.DataFile.Filename),
119+
contentURL: contentURL.String(),
120+
size: file.DataFile.FileSize,
121+
modTime: modTime,
122+
md5: file.DataFile.MD5,
123+
contentType: file.DataFile.ContentType,
124+
}
125+
if file.DataFile.OriginalFileName != "" {
126+
entry.remote = path.Join(file.DirectoryLabel, file.DataFile.OriginalFileName)
127+
entry.size = file.DataFile.OriginalFileSize
128+
entry.contentType = file.DataFile.OriginalFileFormat
129+
}
130+
entries = append(entries, entry)
131+
}
132+
// Populate the cache
133+
cacheEntries := []Object{}
134+
for _, entry := range entries {
135+
cacheEntries = append(cacheEntries, *entry)
136+
}
137+
f.cache.Put("files", cacheEntries)
138+
return entries, nil
139+
}

0 commit comments

Comments
 (0)