multiformats · Stebalien · Jan 31, 2020 · Jan 31, 2020 · Feb 1, 2020 · Feb 1, 2020
@@ -58,6 +58,29 @@ The first 127 bits are encoded as a single-byte varint, hence they are reserved
 - [Ruby](https://github.com/sleeplessbyte/ruby-multicodec)
 - [Add yours today!](https://github.com/multiformats/multicodec/edit/master/table.csv)
 
+### Code Ranges
+
+There are some reserved code ranges.
+
+#### MIME Types
+
+The range 0x200000 - 0x2fffff is reserved for MIME types. Specifically, we've
+reserved:
+
+```
+Range 0x200000 - 0x20ffff: reserved for 'application/*' (there currently are ~1,300 subtypes)
+Range 0x210000 - 0x21ffff: reserved for 'audio/*' (there currently are ~150 subtypes)
+Range 0x220000 - 0x22ffff: reserved for 'font/*' (there currently are ~8 subtypes)
+Range 0x230000 - 0x23ffff: reserved for 'image/*' (there currently are ~60 subtypes)
+Range 0x240000 - 0x24ffff: reserved for 'message/*' (there currently are ~18 subtypes)
+Range 0x250000 - 0x25ffff: reserved for 'model/*' (there currently are ~24 subtypes)
+Range 0x260000 - 0x26ffff: reserved for 'multipart/*' (there currently are ~13 subtypes)
+Range 0x270000 - 0x27ffff: reserved for 'text/*' (there currently are ~71 subtypes)
+Range 0x280000 - 0x28ffff: reserved for 'video/*' (there currently are ~78 subtypes)
+```
+
+Everything from 0x290000 to 0x2fffff is reserved for future media types.
+
 ## FAQ
 
 > Why varints?

@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+
+import csv
+import os
+import urllib.request
+import xml.etree.ElementTree as ET
+
+sections = {
+    "application": 0x200000,
+    "audio": 0x210000,
+    "font": 0x220000,
+    "image": 0x230000,
+    "message": 0x240000,
+    "model": 0x250000,
+    "multipart": 0x260000,
+    "text": 0x270000,
+    "video": 0x280000,
+}
+
+ns = {'a': 'http://www.iana.org/assignments'}
+source = "https://www.iana.org/assignments/media-types/media-types.xml"
+
+
+class Table(list):
+    def __init__(self, fname='table.csv'):
+        self._fname = fname
+        with open(fname) as table:
+            self.extend(csv.reader(table, skipinitialspace=True))
+
+    def save(self):
+        widths = {}
+        for row in self:
+            for i, cell in enumerate(row):
+                if len(cell) > widths.get(i, 0):
+                    widths[i] = len(cell)
+
+        formatted = ((("" if i == 0 else " " *
+                       (1 + widths[i - 1] - len(row[i - 1]))) + cell
+                      for i, cell in enumerate(row)) for row in self)
+
+        tmpfname = self._fname + ".tmp"
+        with open(tmpfname, 'w') as table:
+            writer = csv.writer(table)
+            writer.writerows(formatted)
+        os.rename(tmpfname, self._fname)
+
+
+def formatCode(code: int) -> str:
+    nbytes = 0
+    if code == 0:
+        nbytes = 1
+    else:
+        remaining = code
+        while remaining > 0:
+            remaining >>= 7
+            nbytes += 1
+
+    return f"0x{code:0{nbytes*2}x}"
+
+
+def main():
+    table = Table("table.csv")
+    lastCode = sections.copy()
+    assigned = {}
+    mimeStart = 0
+    mimeEnd = 0
+    for mimeStart, [_, tag, _, _] in enumerate(table[1:]):
+        if tag == "mimetype":
+            break
+    else:
+        mimeStart += 1
+
+    mimeStart += 1  # initial offset
+
+    for mimeEnd, [name, tag, code,
+                  description] in enumerate(table[mimeStart:]):
+        if tag != "mimetype":
+            break
+
+        code = int(code, 16)
+
+        assigned[name] = (code, description)
+
+        parts = name.split('/')
+        section = parts[0]
+        if section not in sections:
+            raise RuntimeError(f"unknown mime base type {name}")
+        if len(parts) == 1:
+            continue
+        elif len(parts) != 2:
+            raise RuntimeError(f"invalid mimetype {name}")
+
+        subtype = parts[1]
+        lastCode[section] += 1
+        if code & 0xff0000 != sections[section]:
+            raise RuntimeError(f"wrong section for type")
+        if lastCode[section] != code:
+            raise RuntimeError(
+                f"expected code 0x{lastCode[section]:x}, got 0x{code:x}")
+    else:
+        mimeEnd += 1
+
+    mimeEnd += mimeStart  # initial offset
+
+    for [_, tag, _, _] in table[mimeEnd:]:
+        if tag == "mimetype":
+            raise RuntimeError(
+                f"did not expect an mimetype out of the mime range")
+
+    with urllib.request.urlopen(source) as f:
+        root = ET.parse(f).getroot()
+
+    if root.get("id") != "media-types":
+        raise RuntimeError("expected root node to have id 'media-types'")
+
+    for mimetype in root.iterfind(
+            './a:registry/a:record/a:file',
+            ns,
+    ):
+        mimetype = mimetype.text
+        if mimetype in assigned:
+            continue
+        [section, subtype] = mimetype.split('/', 1)
+        code = lastCode[section] + 1
+        lastCode[section] = code
+        assigned[mimetype] = (code, "")
+
+    items = [(code, name, description)
+             for name, (code, description) in assigned.items()]
+    items.sort(key=lambda item: item[0])
+    table[mimeStart:mimeEnd] = [(name, "mimetype", formatCode(code),
+                                 description)
+                                for (code, name, description) in items]
+    table.save()
+
+
+if __name__ == "__main__":
+    main()