Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: assign codes for MIME types #159

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,29 @@ The first 127 bits are encoded as a single-byte varint, hence they are reserved
- [Ruby](https://github.com/sleeplessbyte/ruby-multicodec)
- [Add yours today!](https://github.com/multiformats/multicodec/edit/master/table.csv)

### Code Ranges

There are some reserved code ranges.

#### MIME Types

The range 0x200000 - 0x2fffff is reserved for MIME types. Specifically, we've
reserved:

```
Range 0x200000 - 0x20ffff: reserved for 'application/*' (there currently are ~1,300 subtypes)
Range 0x210000 - 0x21ffff: reserved for 'audio/*' (there currently are ~150 subtypes)
Range 0x220000 - 0x22ffff: reserved for 'font/*' (there currently are ~8 subtypes)
Range 0x230000 - 0x23ffff: reserved for 'image/*' (there currently are ~60 subtypes)
Range 0x240000 - 0x24ffff: reserved for 'message/*' (there currently are ~18 subtypes)
Range 0x250000 - 0x25ffff: reserved for 'model/*' (there currently are ~24 subtypes)
Range 0x260000 - 0x26ffff: reserved for 'multipart/*' (there currently are ~13 subtypes)
Range 0x270000 - 0x27ffff: reserved for 'text/*' (there currently are ~71 subtypes)
Range 0x280000 - 0x28ffff: reserved for 'video/*' (there currently are ~78 subtypes)
```

Everything from 0x290000 to 0x2fffff is reserved for future top-level media types.

## FAQ

> Why varints?
Expand Down
138 changes: 138 additions & 0 deletions import-mime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env python

import csv
import os
import urllib.request
import xml.etree.ElementTree as ET

# Base code of every supported top-level media type. Each type owns a block
# of 0x10000 codes; the blocks are laid out back to back from 0x200000 in
# the order the types are listed here.
sections = {
    top_level: 0x200000 + (index << 16)
    for index, top_level in enumerate((
        "application",
        "audio",
        "font",
        "image",
        "message",
        "model",
        "multipart",
        "text",
        "video",
    ))
}

# Prefix map used when querying the registry document with ElementTree.
ns = dict(a='http://www.iana.org/assignments')
# Location of the registry document that main() downloads.
source = "https://www.iana.org/assignments/media-types/media-types.xml"


class Table(list):
    """A CSV file held in memory as a list of rows.

    Every element is one row, i.e. a sequence of cell strings. The rows are
    read once on construction and written back, column-aligned, by save().
    """

    def __init__(self, fname='table.csv'):
        """Load all rows of *fname*.

        Whitespace that follows a delimiter is dropped while reading
        (``skipinitialspace=True``), so the padding written by save() does
        not end up in the cell values.
        """
        super().__init__()
        self._fname = fname
        # newline='' hands newline handling to the csv module, as its
        # documentation requires for files passed to reader/writer.
        with open(fname, newline='') as table:
            self.extend(csv.reader(table, skipinitialspace=True))

    def save(self):
        """Write the rows back to the file they were loaded from.

        Every cell except the first of a row is prefixed with enough spaces
        to make the columns line up (one space plus whatever the previous
        cell lacks compared to the widest cell of its column). The data is
        written to ``<fname>.tmp`` first and then moved over the original.
        """
        # Width of the widest cell per column. Every column that occurs gets
        # an entry, including columns that only hold empty cells; otherwise
        # the lookup of the previous column's width below would fail.
        widths = {}
        for row in self:
            for i, cell in enumerate(row):
                widths[i] = max(widths.get(i, 0), len(cell))

        formatted = [
            [
                cell if i == 0 else
                " " * (1 + widths[i - 1] - len(row[i - 1])) + cell
                for i, cell in enumerate(row)
            ]
            for row in self
        ]

        tmpfname = self._fname + ".tmp"
        with open(tmpfname, 'w', newline='') as table:
            csv.writer(table).writerows(formatted)
        # os.replace overwrites an existing destination on every platform;
        # os.rename raises on Windows when the destination exists.
        os.replace(tmpfname, self._fname)


def formatCode(code: int) -> str:
    """Render *code* as a ``0x``-prefixed, zero-padded lowercase hex string.

    The padding width is two hex digits for every 7 bits needed to hold
    *code*, with a minimum of two digits for zero. (Presumably this mirrors
    the byte length of the varint encoding of the code; confirm against the
    table format.)
    """
    if code > 0:
        # Number of 7-bit groups, rounded up.
        groups = (code.bit_length() + 6) // 7
    elif code == 0:
        groups = 1
    else:
        # Negative input requests no padding at all.
        groups = 0

    return f"0x{code:0{groups*2}x}"


def main():
    """Synchronise the "mimetype" rows of table.csv with the IANA registry.

    Steps:

    1. Load table.csv and locate the contiguous run of rows whose tag is
       "mimetype" (the first row is treated as a header and skipped).
    2. Validate those rows: the top-level type must be a known section, the
       code must lie in that section, and codes must be consecutive within
       a section, in table order.
    3. Download the registry document and give every media type that is not
       in the table yet the next free code of its section.
    4. Replace the mimetype run with all entries sorted by code and save.

    Raises:
        RuntimeError: if the existing rows are inconsistent, if a mimetype
            row appears outside the contiguous run, if the downloaded
            document is not the expected registry, if a registry entry has
            no known section, or if a section has run out of codes.
    """
    table = Table("table.csv")
    # Highest code used so far per section; starts at the section base.
    lastCode = sections.copy()
    # Media type name -> (code, description), existing and newly assigned.
    assigned = {}

    # First mimetype row, skipping the header at index 0. Without any such
    # row the (empty) run sits at the end of the table.
    mimeStart = len(table)
    for index, [_, tag, _, _] in enumerate(table[1:], start=1):
        if tag == "mimetype":
            mimeStart = index
            break

    # Walk the run of mimetype rows, validating each one; mimeEnd ends up as
    # the index just past the run.
    mimeEnd = len(table)
    for index, [name, tag, code, description] in enumerate(
            table[mimeStart:], start=mimeStart):
        if tag != "mimetype":
            mimeEnd = index
            break

        code = int(code, 16)
        assigned[name] = (code, description)

        parts = name.split('/')
        section = parts[0]
        if section not in sections:
            raise RuntimeError(f"unknown mime base type {name}")
        if len(parts) == 1:
            # A name without a subtype is kept as is and does not take part
            # in the per-section sequence check.
            continue
        if len(parts) != 2:
            raise RuntimeError(f"invalid mimetype {name}")

        lastCode[section] += 1
        if code & 0xff0000 != sections[section]:
            raise RuntimeError(f"wrong section for type {name}")
        if lastCode[section] != code:
            raise RuntimeError(
                f"expected code 0x{lastCode[section]:x}, got 0x{code:x}")

    # The run must be the only place where mimetype rows occur.
    for [_, tag, _, _] in table[mimeEnd:]:
        if tag == "mimetype":
            raise RuntimeError(
                "did not expect an mimetype out of the mime range")

    with urllib.request.urlopen(source) as f:
        root = ET.parse(f).getroot()

    if root.get("id") != "media-types":
        raise RuntimeError("expected root node to have id 'media-types'")

    for record in root.iterfind('./a:registry/a:record/a:file', ns):
        mimetype = (record.text or "").strip()
        if not mimetype or mimetype in assigned:
            continue

        section, separator, _ = mimetype.partition('/')
        if not separator or section not in sections:
            # Same failure as for table rows, instead of a bare KeyError or
            # ValueError: this top-level type has no code range here.
            raise RuntimeError(f"unknown mime base type {mimetype}")

        code = lastCode[section] + 1
        # Each section spans base .. base + 0xffff; going past that would
        # hand out codes belonging to the next section.
        if code > sections[section] + 0xffff:
            raise RuntimeError(
                f"no codes left in section {section} for {mimetype}")
        lastCode[section] = code
        assigned[mimetype] = (code, "")

    # Stable sort: entries sharing a code keep their insertion order.
    by_code = sorted(assigned.items(), key=lambda entry: entry[1][0])
    table[mimeStart:mimeEnd] = [
        (name, "mimetype", formatCode(code), description)
        for name, (code, description) in by_code
    ]
    table.save()


# Run the import only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()
Loading