Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 45 additions & 6 deletions ext/bert/c/decode.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "ruby.h"
#include "ruby/encoding.h"
#include <stdint.h>
#include <netinet/in.h>

Expand All @@ -14,9 +15,12 @@
#define ERL_BIN 109
#define ERL_SMALL_BIGNUM 110
#define ERL_LARGE_BIGNUM 111
/*
 * BERT v2 extension tags. These are specific to our own v2 format and are
 * NOT Erlang External Term Format types: that format uses 112 and 113 for
 * NEW_FUN_EXT and EXPORT_EXT respectively. The values were chosen because
 * the tag is used as the offset into the bert_callbacks table.
 */
#define ERL_ENC_STRING 112
#define ERL_UNICODE_STRING 113

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC this is the first time we're diverging from erlang's External Term Format. And as said format does have these values (for NEW_FUN_EXT and EXPORT_EXT resp.), I think a comment explaining that we knowingly diverge here for version 2 would be useful.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there other (available) values we should use instead?

Adherence to erlang formats isn't important for GitHub's uses of BERT -- which all involve Ruby -- but I could imagine it being an issue if anyone else is still using BERT with erlang.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming the online docs are up to date, numbers from 120 onwards are free, but they could probably add new types at any point.

We could reduce the divergence by using 107 (STRING_EXT) for UTF-8 strings whose length fits within 2 bytes and falling back to ENC_STRING when they're bigger, but for ENC_STRING we'd still have to either re-use an existing tag or pick a free one and hope Erlang never assigns it to a new type we find useful.

The types we're making unavailable aren't something that I think makes sense to transfer between Ruby and Erlang, so I don't think anybody will miss them. But even so, making a note of the rationale in the code would be useful.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I chose these values because they are used for the offset in the callbacks struct. Would adding a comment and removing the ERL prefix be enough?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The use in the offset makes it a bit trickier. How about we use ERLEXT_ and have a comment saying these are specific to our own v2? My concern here is coming back to the code and having to figure out how the C, ruby and spec match up so leaving a note would be polite :)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. I added a comment! :D

#define ERL_VERSION 131
#define ERL_VERSION2 132

#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERL_LARGE_BIGNUM)
#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERL_UNICODE_STRING)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The validity would depend on the protocol version, no? If I say I'm using v1 but send ENC_STRING should we consider it invalid? It looks like we'll accept v2 regardless of the header, which may or may not be a big deal.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

¯_(ツ)_/¯ I can make the decoder complain if you use a v2 value in a v1 payload. But that means that I'll have to teach bert_read about the type it's parsing. I can set that on the bert_buf *buf value; it just means more code.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably not worth the bother, tbh. Let's leave it as you have it here.

#define BERT_TYPE_OFFSET (ERL_SMALL_INT)

static VALUE rb_mBERT;
Expand All @@ -40,6 +44,8 @@ static VALUE bert_read_nil(struct bert_buf *buf);
static VALUE bert_read_string(struct bert_buf *buf);
static VALUE bert_read_list(struct bert_buf *buf);
static VALUE bert_read_bin(struct bert_buf *buf);
static VALUE bert_read_enc_string(struct bert_buf *buf);
static VALUE bert_read_unicode_string(struct bert_buf *buf);
static VALUE bert_read_sbignum(struct bert_buf *buf);
static VALUE bert_read_lbignum(struct bert_buf *buf);

Expand All @@ -59,7 +65,9 @@ static bert_ptr bert_callbacks[] = {
&bert_read_list,
&bert_read_bin,
&bert_read_sbignum,
&bert_read_lbignum
&bert_read_lbignum,
&bert_read_enc_string,
&bert_read_unicode_string
};

static inline uint8_t bert_buf_read8(struct bert_buf *buf)
Expand Down Expand Up @@ -293,6 +301,34 @@ static VALUE bert_read_bin(struct bert_buf *buf)
return rb_bin;
}

/*
 * Decode a UNICODE_STRING term: read the payload with bert_read_bin() and
 * associate the resulting Ruby string with the UTF-8 encoding.
 */
static VALUE bert_read_unicode_string(struct bert_buf *buf)
{
	VALUE utf8_str = bert_read_bin(buf);

	rb_enc_associate(utf8_str, rb_utf8_encoding());
	return utf8_str;
}

/*
 * Decode an ENC_STRING term: a payload read with bert_read_bin(), followed
 * by a nested BIN term whose payload names the encoding to associate with
 * the first string.
 *
 * Raises RuntimeError when the term following the payload is not tagged
 * ERL_BIN, and ArgumentError (from rb_to_encoding) when the encoding name is
 * not known to Ruby.
 */
static VALUE bert_read_enc_string(struct bert_buf *buf)
{
	uint8_t type;
	VALUE rb_bin, enc;

	rb_bin = bert_read_bin(buf);

	/* The encoding name must arrive as an ordinary binary term. */
	bert_buf_ensure(buf, 1);
	type = bert_buf_read8(buf);
	if (ERL_BIN != type)
		rb_raise(rb_eRuntimeError, "Invalid tag '%d' for term", type);

	enc = bert_read_bin(buf);

	/*
	 * rb_find_encoding() returns NULL for an unknown name, which would leave
	 * the string silently mislabelled. rb_to_encoding() raises instead, which
	 * matches the pure-Ruby decoder's use of String#force_encoding.
	 */
	rb_enc_associate(rb_bin, rb_to_encoding(enc));

	return rb_bin;
}

static VALUE bert_read_string(struct bert_buf *buf)
{
uint16_t i, length;
Expand Down Expand Up @@ -467,17 +503,20 @@ static VALUE bert_read_invalid(struct bert_buf *buf)
static VALUE rb_bert_decode(VALUE klass, VALUE rb_string)
{
struct bert_buf buf;
uint8_t proto_version;

Check_Type(rb_string, T_STRING);
buf.data = (uint8_t *)RSTRING_PTR(rb_string);
buf.end = buf.data + RSTRING_LEN(rb_string);

bert_buf_ensure(&buf, 1);

if (bert_buf_read8(&buf) != ERL_VERSION)
rb_raise(rb_eTypeError, "Invalid magic value for BERT string");

return bert_read(&buf);
proto_version = bert_buf_read8(&buf);
if (proto_version == ERL_VERSION || proto_version == ERL_VERSION2) {
return bert_read(&buf);
} else {
rb_raise(rb_eTypeError, "Invalid magic value for BERT string");
}
}

static VALUE rb_bert_impl(VALUE klass)
Expand Down
2 changes: 1 addition & 1 deletion lib/bert.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@
# Global method for specifying that an array should be encoded as a tuple.
def t
BERT::Tuple
end
end
32 changes: 30 additions & 2 deletions lib/bert/decode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@ def self.impl
def self.decode(string)
io = StringIO.new(string)
io.set_encoding('binary') if io.respond_to?(:set_encoding)
new(io).read_any
header = io.getbyte
case header
when MAGIC, VERSION_2
new(io).read_any
else
fail("Bad Magic")
end
end

def initialize(ins)
Expand All @@ -19,7 +25,6 @@ def initialize(ins)
end

def read_any
fail("Bad Magic") unless read_1 == MAGIC
read_any_raw
end

Expand All @@ -37,6 +42,8 @@ def read_any_raw
when STRING then read_erl_string
when LIST then read_list
when BIN then read_bin
when ENC_STRING then read_enc_string
when UNICODE_STRING then read_unicode_string
else
fail("Unknown term tag: #{peek_1}")
end
Expand Down Expand Up @@ -223,6 +230,14 @@ def read_nil
[]
end

# Reads a UNICODE_STRING term: the tag byte, a length (via read_4), then that
# many bytes of payload, returned as a String tagged UTF-8.
def read_unicode_string
  tag = read_1
  fail("Invalid Type, not a unicode string") unless tag == UNICODE_STRING
  byte_count = read_4
  read_string(byte_count).force_encoding("UTF-8")
end

def read_erl_string
fail("Invalid Type, not an erlang string") unless read_1 == STRING
length = read_2
Expand All @@ -246,5 +261,18 @@ def read_bin
# Aborts decoding by raising +str+ (a RuntimeError when +str+ is a message
# String).
def fail(str)
  raise(str)
end

private

# Reads an ENC_STRING term: the string payload, followed by a nested BIN term
# whose payload names the encoding to apply to that string.
#
# Raises (via #fail) when either tag byte is not the expected one.
# String#force_encoding raises ArgumentError when the encoding name is unknown.
def read_enc_string
  fail("Invalid Type, not an encoded string") unless read_1 == ENC_STRING
  length = read_4
  x = read_string(length)

  # The encoding name travels as an ordinary binary term right after the payload.
  fail("Invalid Type, not an erlang binary") unless read_1 == BIN
  length = read_4
  x.force_encoding read_string(length)
  x
end
end
end
55 changes: 53 additions & 2 deletions lib/bert/encode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,47 @@ module BERT
class Encode
include Types

# Encoder for the v2 wire format. It writes VERSION_2 as the header byte and
# picks a string tag based on the String's encoding, so the encoding can be
# restored by the decoder.
class V2 < Encode
  # Writes +data+ using the tag that matches its encoding:
  # * ASCII-8BIT       -> the inherited binary encoding (super)
  # * UTF-8 / US-ASCII -> UNICODE_STRING
  # * anything else    -> ENC_STRING, which also carries the encoding name
  def write_binary(data)
    case data.encoding
    when ::Encoding::ASCII_8BIT
      super
    when ::Encoding::UTF_8, ::Encoding::US_ASCII
      write_unicode_string data
    else
      write_enc_string data
    end
  end

  private

  # Tag byte, 4-byte byte length, then the raw bytes.
  def write_unicode_string(data)
    write_1 UNICODE_STRING
    write_4 data.bytesize
    write_string data
  end

  # Same layout as a unicode string, followed by a BIN term holding the
  # encoding's name.
  def write_enc_string(data)
    encoding_name = data.encoding.name

    write_1 ENC_STRING
    write_4 data.bytesize
    write_string data

    write_1 BIN
    write_4 encoding_name.bytesize
    write_string encoding_name
  end

  # Header byte identifying a v2 payload.
  def version_header
    VERSION_2
  end
end

class << self
attr_accessor :version
end
self.version = :v1

attr_accessor :out

def initialize(out)
Expand All @@ -11,12 +52,18 @@ def initialize(out)
def self.encode(data)
io = StringIO.new
io.set_encoding('binary') if io.respond_to?(:set_encoding)
self.new(io).write_any(data)

if version == :v2
Encode::V2.new(io).write_any(data)
else
new(io).write_any(data)
end

io.string
end

def write_any obj
write_1 MAGIC
write_1 version_header
write_any_raw obj
end

Expand Down Expand Up @@ -132,6 +179,10 @@ def write_binary(data)

private

# Header byte that write_any emits before the encoded term. MAGIC identifies
# the original format; Encode::V2 overrides this method to return VERSION_2.
def version_header
MAGIC
end

# Raises a RuntimeError whose message includes the inspected form of +obj+,
# the object that could not be encoded.
def fail(obj)
  message = "Cannot encode to erlang external format: #{obj.inspect}"
  raise message
end
Expand Down
6 changes: 4 additions & 2 deletions lib/bert/types.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ module Types
STRING = 107
LIST = 108
BIN = 109
# BERT v2 extension tags. These are specific to our own v2 format and are not
# part of Erlang's External Term Format, which uses 112 and 113 for
# NEW_FUN_EXT and EXPORT_EXT. Note that ENC_STRING has the same value as
# NEW_FUN below.
ENC_STRING = 112
UNICODE_STRING = 113
FUN = 117
NEW_FUN = 112
MAGIC = 131
VERSION_2 = 132
MAX_INT = (1 << 27) -1
MIN_INT = -(1 << 27)
end
end
end
49 changes: 42 additions & 7 deletions test/bert_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,55 @@ class BertTest < Test::Unit::TestCase
setup do
time = Time.at(1254976067)
@ruby = t[:user, {:name => 'TPW'}, [/cat/i, 9.9], time, nil, true, false, :true, :false]
@bert = "\203h\td\000\004userh\003d\000\004bertd\000\004dictl\000\000\000\001h\002d\000\004namem\000\000\000\003TPWjl\000\000\000\002h\004d\000\004bertd\000\005regexm\000\000\000\003catl\000\000\000\001d\000\bcaselessjc9.900000000000000e+00\000\000\000\000\000\000\000\000\000\000jh\005d\000\004bertd\000\004timeb\000\000\004\346b\000\016\344\303a\000h\002d\000\004bertd\000\003nilh\002d\000\004bertd\000\004trueh\002d\000\004bertd\000\005falsed\000\004trued\000\005false"
@ebin = "<<131,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,109,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,109,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
@bert_old = "\203h\td\000\004userh\003d\000\004bertd\000\004dictl\000\000\000\001h\002d\000\004namem\000\000\000\003TPWjl\000\000\000\002h\004d\000\004bertd\000\005regexm\000\000\000\003catl\000\000\000\001d\000\bcaselessjc9.900000000000000e+00\000\000\000\000\000\000\000\000\000\000jh\005d\000\004bertd\000\004timeb\000\000\004\346b\000\016\344\303a\000h\002d\000\004bertd\000\003nilh\002d\000\004bertd\000\004trueh\002d\000\004bertd\000\005falsed\000\004trued\000\005false".b
@ebin_old = "<<131,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,109,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,109,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
end

should "encode" do
assert_equal @bert, BERT.encode(@ruby)
context "v2 encoder" do
setup do
@old_version = BERT::Encode.version
BERT::Encode.version = :v2
@bert = "\x84h\td\x00\x04userh\x03d\x00\x04bertd\x00\x04dictl\x00\x00\x00\x01h\x02d\x00\x04nameq\x00\x00\x00\x03TPWjl\x00\x00\x00\x02h\x04d\x00\x04bertd\x00\x05regexq\x00\x00\x00\x03catl\x00\x00\x00\x01d\x00\bcaselessjc9.900000000000000e+00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00jh\x05d\x00\x04bertd\x00\x04timeb\x00\x00\x04\xE6b\x00\x0E\xE4\xC3a\x00h\x02d\x00\x04bertd\x00\x03nilh\x02d\x00\x04bertd\x00\x04trueh\x02d\x00\x04bertd\x00\x05falsed\x00\x04trued\x00\x05false".b
@ebin = "<<132,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,113,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,113,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
end

teardown do
BERT::Encode.version = @old_version
end

should "decode new format" do
assert_equal @ruby, BERT.decode(@bert)
end

should "roundtrip string and maintain encoding" do
str = "日本語".encode 'EUC-JP'
round = BERT.decode(BERT.encode(str))
assert_equal str, round
assert_equal str.encoding, round.encoding
end

should "roundtrip binary string" do
str = "日本語".b
round = BERT.decode(BERT.encode(str))
assert_equal str, round
assert_equal str.encoding, round.encoding
end

should "encode" do
assert_equal @bert, BERT.encode(@ruby)
end

should "ebin" do
assert_equal @ebin, BERT.ebin(@bert)
end
end

should "decode" do
assert_equal @ruby, BERT.decode(@bert)
should "decode the old format" do
assert_equal @ruby, BERT.decode(@bert_old)
end

should "ebin" do
assert_equal @ebin, BERT.ebin(@bert)
assert_equal @ebin_old, BERT.ebin(@bert_old)
end

should "do roundtrips" do
Expand Down
31 changes: 31 additions & 0 deletions test/encoder_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ class EncoderTest < Test::Unit::TestCase
end

should 'handle utf8 strings' do
  # Encode the explicitly UTF-8 string built here rather than a second
  # literal, so the value under test is the one this test set up.
  str = "été".encode 'UTF-8'
  bert = [131, 109, 0, 0, 0, 5, 195, 169, 116, 195, 169].pack('C*')
  assert_equal bert, BERT::Encoder.encode(str)
end
Expand All @@ -99,6 +100,36 @@ class EncoderTest < Test::Unit::TestCase
assert_equal bert, BERT::Encoder.encode(-10_000_000_000_000_000_000)
end

context "v2" do
setup do
@old_version = BERT::Encode.version
BERT::Encode.version = :v2
end

teardown do
BERT::Encode.version = @old_version
end

should 'handle utf8 strings' do
  # Encode the explicitly UTF-8 string built here rather than a second
  # literal; under v2 it must be written with the UNICODE_STRING tag (113).
  str = "été".encode 'UTF-8'
  bert = [132, 113, 0, 0, 0, 5, 195, 169, 116, 195, 169].pack('C*')
  assert_equal bert, BERT::Encoder.encode(str)
end

should 'handle utf8 symbols' do
bert = [132, 100, 0, 5, 195, 169, 116, 195, 169].pack('C*')
assert_equal bert, BERT::Encoder.encode(:'été')
end

should "handle bignums" do
bert = [132,110,8,0,0,0,232,137,4,35,199,138].pack('c*')
assert_equal bert, BERT::Encoder.encode(10_000_000_000_000_000_000)

bert = [132,110,8,1,0,0,232,137,4,35,199,138].pack('c*')
assert_equal bert, BERT::Encoder.encode(-10_000_000_000_000_000_000)
end
end

should "leave other stuff alone" do
before = [1, 2.0, [:foo, 'bar']]
assert_equal before, BERT::Encoder.convert(before)
Expand Down