diff --git a/src/dev_codec_httpsig_siginfo.erl b/src/dev_codec_httpsig_siginfo.erl index 493f9421e..7ec445088 100644 --- a/src/dev_codec_httpsig_siginfo.erl +++ b/src/dev_codec_httpsig_siginfo.erl @@ -27,12 +27,14 @@ commitments_to_siginfo(_Msg, Comms, _Opts) when ?IS_EMPTY_MESSAGE(Comms) -> #{}; commitments_to_siginfo(Msg, Comms, Opts) -> - % Generate a SF item for each commitment's signature and signature-input. + % Emit a SF item per commitment. `CommID' is threaded through so + % `commitment_to_sf_siginfo/4' can add an `id' parameter whenever the + % decoder-side derivation would not reproduce the sender's map key. {Sigs, SigInputs} = maps:fold( - fun(_CommID, Commitment, {Sigs, SigInputs}) -> + fun(CommID, Commitment, {Sigs, SigInputs}) -> {ok, SigNameRaw, SFSig, SFSigInput} = - commitment_to_sf_siginfo(Msg, Commitment, Opts), + commitment_to_sf_siginfo(Msg, CommID, Commitment, Opts), SigName = <<"comm-", SigNameRaw/binary>>, { Sigs#{ SigName => SFSig }, @@ -51,12 +53,13 @@ commitments_to_siginfo(Msg, Comms, Opts) -> %% @doc Generate a `signature' and `signature-input' key pair from a given %% commitment. -commitment_to_sf_siginfo(Msg, Commitment, Opts) -> +commitment_to_sf_siginfo(Msg, CommID, Commitment, Opts) -> % Generate the `alg' key from the commitment. Alg = commitment_to_alg(Commitment, Opts), % Find the public key from the commitment, which we will use as the - % `keyid' in the `signature-input' keys. - KeyID = maps:get(<<"keyid">>, Commitment, <<>>), + % `keyid' in the `signature-input' keys. Absent in the commitment => + % absent on the wire (permitted by RFC 9421 §1.4.2.3). + KeyID = maps:get(<<"keyid">>, Commitment, undefined), % Extract the signature from the commitment. Signature = hb_util:decode(maps:get(<<"signature">>, Commitment)), % Extract the keys present in the commitment. @@ -70,12 +73,28 @@ commitment_to_sf_siginfo(Msg, Commitment, Opts) -> Expires = maps:get(<<"expires">>, Commitment, undefined), % Generate the name of the signature. 
SigName = hb_util:to_lower(hb_util:human_id(crypto:hash(sha256, Signature))), - % Generate the signature input and signature structured-fields. These can + % If the decoder's derivation would not reproduce the sender's map key, + % transport it explicitly as an `id' parameter. Content-addressed devices + % (e.g. `~ipfs@1.0') key on a CID that is not a function of `Sig'; HMAC, + % RSA-PSS, and other `h(Sig)'-keyed devices never pay this cost. + DerivedID = derived_commitment_id(Signature), + IDParam = + case CommID of + undefined -> []; + DerivedID -> []; + _ -> [{<<"id">>, {string, CommID}}] + end, + % Generate the signature input and signature structured-fields. These can % then be placed into a dictionary with other commitments and transformed % into their binary representations. SFSig = {item, {binary, Signature}, []}, AdditionalParams = get_additional_params(Commitment), - Params = + KeyIDItem = + case KeyID of + undefined -> undefined; + _ -> {string, KeyID} + end, + Params = lists:filter( fun({_Key, undefined}) -> false; @@ -84,12 +103,12 @@ commitment_to_sf_siginfo(Msg, Commitment, Opts) -> end, [ {<<"alg">>, {string, Alg}}, - {<<"keyid">>, {string, KeyID}}, + {<<"keyid">>, KeyIDItem}, {<<"tag">>, {string, Tag}}, {<<"created">>, Created}, {<<"expires">>, Expires}, {<<"nonce">>, {string, Nonce}} - ] ++ AdditionalParams + ] ++ IDParam ++ AdditionalParams ), SFSigInput = {list, @@ -113,11 +132,19 @@ commitment_to_sf_siginfo(Msg, Commitment, Opts) -> ), {ok, SigName, SFSig, SFSigInput}. +%% @doc Default commitment ID derivation used on both encode and decode +%% when no explicit `id' parameter is present. 32-byte sigs are used +%% directly; longer sigs are rehashed with sha-256. +derived_commitment_id(Sig) when byte_size(Sig) == 32 -> + hb_util:human_id(Sig); +derived_commitment_id(Sig) -> + hb_util:human_id(crypto:hash(sha256, Sig)). 
+ get_additional_params(Commitment) -> AdditionalParams = sets:to_list( sets:subtract( - sets:from_list(maps:keys(Commitment)), + sets:from_list(maps:keys(Commitment)), sets:from_list( [ <<"alg">>, @@ -129,6 +156,7 @@ get_additional_params(Commitment) -> <<"committed">>, <<"signature">>, <<"type">>, + <<"id">>, <<"commitment-device">>, <<"committer">> ] @@ -248,34 +276,32 @@ sf_siginfo_to_commitment(Msg, BodyKeys, SFSig, SFSigInput, Opts) -> {item, {string, Key}, []} <- SigInput ], CommittedKeys = from_siginfo_keys(Msg, BodyKeys, RawCommittedKeys), - % Merge and cleanup the output. - % 1. Decode the `keyid` (typically a public key) to its raw byte form. - % 2. Decode the `signature` to its raw byte form. - % 3. Filter undefined keys. - % 4. Generate the ID for the commitment from the signature. We use a SHA2-256 - % hash of the signature, unless the signature is 32 bytes, in which case we - % use the signature directly as the ID. - % 5. If the `keyid' is a public key (determined by length >= 32 bytes), set - % the `committer' to its hash. + % Merge and cleanup the output: + % 1. Decode `keyid' and `signature' to raw bytes. + % 2. Filter undefined keys. + % 3. Use the transported `id' parameter when present (content-addressed + % devices), otherwise fall back to `derived_commitment_id/1'. + % 4. If the `keyid' resolves to a public key, set the `committer'. 
Commitment3 = Commitment2#{ <<"signature">> => hb_util:encode(Sig), <<"committed">> => CommittedKeys }, - KeyID = maps:get(<<"keyid">>, Commitment3, <<>>), + {ID, Commitment4} = + case maps:take(<<"id">>, Commitment3) of + {ExplicitID, Stripped} -> {ExplicitID, Stripped}; + error -> {derived_commitment_id(Sig), Commitment3} + end, + KeyID = maps:get(<<"keyid">>, Commitment4, <<>>), Commitment5 = case dev_codec_httpsig_keyid:keyid_to_committer(KeyID) of undefined -> - Commitment3; + Commitment4; Committer -> - Commitment3#{ + Commitment4#{ <<"committer">> => Committer } end, - ID = - if byte_size(Sig) == 32 -> hb_util:human_id(Sig); - true -> hb_util:human_id(crypto:hash(sha256, Sig)) - end, % Return the commitment and calculated ID. {ok, ID, Commitment5}. diff --git a/src/dev_codec_ipfs.erl b/src/dev_codec_ipfs.erl new file mode 100644 index 000000000..c3b8cfcb6 --- /dev/null +++ b/src/dev_codec_ipfs.erl @@ -0,0 +1,286 @@ +%%% @doc `~ipfs@1.0': a codec and commitment device whose commitment IDs are +%%% IPFS CIDv1s over a message's `body'. In codec mode, encodes TABMs to +%%% deterministic dag-cbor and back, routed through `~structured@1.0' the +%%% same way `dev_codec_json' and `dev_codec_flat' do. The `body''s CID is +%%% produced by `dev_codec_ipfs_cid:encode/3'; `hb_cache' then links the +%%% CID to the message's uncommitted ID automatically. +-module(dev_codec_ipfs). +-export([info/1, commit/3, verify/3, content_type/1]). +-export([to/3, from/3]). +-include("include/hb.hrl"). +-include_lib("eunit/include/eunit.hrl"). + +-define(DEVICE_NAME, <<"ipfs@1.0">>). +%% Native commitment types combine the multihash and the CID multicodec so +%% they slot into the wire `alg' field as `ipfs@1.0/' via +%% `dev_codec_httpsig_siginfo:commitment_to_alg/2' — no custom RFC 9421 +%% metadata parameters. +-define(DEFAULT_TYPE, <<"sha2-256-raw">>). +-define(COMMITTED_KEYS, [<<"body">>]). 
-define(IS_NATIVE_TYPE(T),
    (T =:= <<"sha2-256-raw">> orelse T =:= <<"sha2-256-dag-cbor">>)).

%% @doc Restrict AO-Core resolution to the codec/commitment surface.
info(_) ->
    #{ exports => [commit, verify, content_type, to, from] }.

%% @doc Return the IPLD MIME type for a commitment's native `type'. Only
%% `sha2-256-dag-cbor' maps to the dag-cbor MIME type; every other input is
%% reported as raw.
content_type(#{ <<"type">> := <<"sha2-256-dag-cbor">> }) ->
    {ok, <<"application/vnd.ipld.dag-cbor">>};
content_type(_) ->
    {ok, <<"application/vnd.ipld.raw">>}.

%% @doc Attach a CIDv1 commitment over `body'. `type: unsigned' is the
%% generic caller knob — translate it to the codec's native type, read from
%% the request's `hash-alg' key (default `?DEFAULT_TYPE'). Any other commit
%% type (signed, rsa-pss, etc.) delegates to `~httpsig@1.0', the composition
%% pattern used by `dev_codec_flat' and `dev_codec_json'.
%%
%% Returns `{ok, Msg}' with the new commitment merged into `commitments'
%% (keyed by the CID), or `{error, {unsupported_type, Type}}' when an
%% `unsigned' request names a `hash-alg' that is not a native type, or when
%% `type' starts with `sha2-256-' but is not a native type.
commit(Msg, Req = #{ <<"type">> := <<"unsigned">> }, Opts) ->
    %% `hash-alg' must name a native type. Validating it here, instead of
    %% re-dispatching on whatever the caller supplied, stops
    %% `hash-alg: unsigned' from recursing forever and stops an `unsigned'
    %% request from being silently re-routed to `~httpsig@1.0'.
    case hb_maps:get(<<"hash-alg">>, Req, ?DEFAULT_TYPE, Opts) of
        Native when ?IS_NATIVE_TYPE(Native) ->
            Req1 = hb_maps:without([<<"hash-alg">>], Req, Opts),
            commit(Msg, Req1#{ <<"type">> => Native }, Opts);
        Unsupported ->
            {error, {unsupported_type, Unsupported}}
    end;
commit(Msg, #{ <<"type">> := Type }, Opts) when ?IS_NATIVE_TYPE(Type) ->
    Body = hb_maps:get(<<"body">>, Msg, <<>>, Opts),
    Multicodec = multicodec_of(Type),
    CID = dev_codec_ipfs_cid:encode(Multicodec, sha2_256, Body),
    Commitment = #{
        <<"commitment-device">> => ?DEVICE_NAME,
        <<"type">> => Type,
        <<"committed">> => ?COMMITTED_KEYS,
        %% Carrying the raw sha-256 digest as `signature' keeps the
        %% commitment on the httpsig wire (see
        %% `dev_codec_httpsig_siginfo''s signature filter). No `keyid' —
        %% content-addressed commitments need no key material. RFC 9421
        %% §1.4.2.3 permits keyid's absence.
        <<"signature">> => hb_util:encode(crypto:hash(sha256, Body))
    },
    Existing = hb_maps:get(<<"commitments">>, Msg, #{}, Opts),
    ?event(ipfs, {commit, {cid, CID}, {type, Type}, {size, byte_size(Body)}}),
    {ok, Msg#{ <<"commitments">> => Existing#{ CID => Commitment } }};
commit(_Msg, #{ <<"type">> := <<"sha2-256-", _/binary>> = Type }, _Opts) ->
    {error, {unsupported_type, Type}};
commit(Msg, Req, Opts) ->
    dev_codec_httpsig:commit(Msg, Req, Opts).

%% @doc Verify an `~ipfs@1.0' commitment by recomputing the CID from `body'
%% under the declared native type and checking it keys the commitments map.
%% Requests whose `type' is not native are handed to `~httpsig@1.0'.
verify(Base, #{ <<"type">> := Type }, Opts) when ?IS_NATIVE_TYPE(Type) ->
    Body = hb_maps:get(<<"body">>, Base, <<>>, Opts),
    Comms = hb_maps:get(<<"commitments">>, Base, #{}, Opts),
    Expected = dev_codec_ipfs_cid:encode(multicodec_of(Type), sha2_256, Body),
    Res = hb_maps:is_key(Expected, Comms, Opts),
    ?event(ipfs, {verify, {type, Type}, {expected, Expected}, {result, Res}}),
    {ok, Res};
verify(Base, Req, Opts) ->
    dev_codec_httpsig:verify(Base, Req, Opts).

%% @doc Resolve a native `type' to its CID multicodec name.
multicodec_of(<<"sha2-256-raw">>) -> <<"raw">>;
multicodec_of(<<"sha2-256-dag-cbor">>) -> <<"dag-cbor">>.

%% @doc Serialize a TABM to deterministic dag-cbor bytes. Routes through
%% `~structured@1.0' to recover native types, resolves links (dag-cbor is
%% self-contained), strips `priv', and walks the result into the IPLD
%% intermediate form that `dev_codec_ipfs_cbor:encode/1' consumes.
%% Encoder failures on a map are returned as
%% `{error, {dag_cbor_encode, Reason}}'.
to(Bin, _Req, _Opts) when is_binary(Bin) ->
    %% Bare binaries encode as text strings (or byte strings if not UTF-8)
    %% so that `to' / `from' is a roundtrip.
    try {ok, dev_codec_ipfs_cbor:encode(Bin)}
    catch throw:{dag_cbor_encode, {invalid_utf8, _}} ->
        {ok, dev_codec_ipfs_cbor:encode({bytes, Bin})}
    end;
to(Msg, _Req, Opts) when is_map(Msg) ->
    try
        Structured =
            hb_message:convert(
                hb_private:reset(Msg),
                <<"structured@1.0">>,
                tabm,
                Opts
            ),
        Loaded = hb_cache:ensure_all_loaded(Structured, Opts),
        Clean = hb_maps:without([<<"priv">>], Loaded, Opts),
        {ok, dev_codec_ipfs_cbor:encode(structured_to_ipld(Clean))}
    catch throw:{dag_cbor_encode, Reason} ->
        ?event(warning, {ipfs_to_failed, Reason}),
        {error, {dag_cbor_encode, Reason}}
    end.

%% @doc Walk a structured HyperBEAM value into the IPLD intermediate form.
%% Atoms outside `null/true/false' have no IPLD representation and throw.
%% Binaries are passed through unchanged, map keys must be binaries, and any
%% other term throws `{dag_cbor_encode, {unsupported_value, V}}'.
structured_to_ipld(null) -> null;
structured_to_ipld(true) -> true;
structured_to_ipld(false) -> false;
structured_to_ipld(A) when is_atom(A) ->
    throw({dag_cbor_encode, {unsupported_atom, A}});
structured_to_ipld(N) when is_integer(N); is_float(N) -> N;
structured_to_ipld(B) when is_binary(B) -> B;
structured_to_ipld(L) when is_list(L) ->
    [ structured_to_ipld(V) || V <- L ];
structured_to_ipld(M) when is_map(M) ->
    maps:from_list(
        [
            {assert_binary_key(K), structured_to_ipld(V)}
        ||
            {K, V} <- maps:to_list(M)
        ]
    );
structured_to_ipld(V) ->
    throw({dag_cbor_encode, {unsupported_value, V}}).

%% @doc Return `K' if it is a binary; otherwise throw
%% `{dag_cbor_encode, {non_binary_map_key, K}}'.
assert_binary_key(K) when is_binary(K) -> K;
assert_binary_key(K) -> throw({dag_cbor_encode, {non_binary_map_key, K}}).

%% @doc Parse dag-cbor bytes into a TABM. Pre-decoded maps pass through
%% unchanged, matching the `dev_codec_json' / `dev_codec_flat' discipline.
from(Map, _Req, _Opts) when is_map(Map) ->
    %% Already decoded: hand the map back untouched.
    {ok, Map};
from(Bin, Req, Opts) when is_binary(Bin) ->
    case dev_codec_ipfs_cbor:decode(Bin) of
        {error, Reason} ->
            ?event(warning, {ipfs_from_failed, Reason}),
            {error, {dag_cbor_decode, Reason}};
        {ok, Ipld} ->
            finalize_decoded(ipld_to_structured(Ipld), Req, Opts)
    end.

%% @doc Finish a decode: maps are routed through `dev_codec_structured:from/3',
%% every other decoded value is returned as-is in an `ok' tuple.
finalize_decoded(Decoded, Req, Opts) when is_map(Decoded) ->
    dev_codec_structured:from(Decoded, Req, Opts);
finalize_decoded(Decoded, _Req, _Opts) ->
    {ok, Decoded}.

%% @doc Convert the IPLD intermediate form into a plain HyperBEAM value.
%% `{bytes, B}' collapses to the binary `B' and `{link, CID}' collapses to
%% the CID string (no `hb_link' mapping is performed here). Lists and maps
%% are converted element-wise; scalars are returned unchanged.
ipld_to_structured({bytes, Bytes}) ->
    Bytes;
ipld_to_structured({link, CID}) ->
    CID;
ipld_to_structured(Map) when is_map(Map) ->
    maps:map(fun(_Key, Value) -> ipld_to_structured(Value) end, Map);
ipld_to_structured(List) when is_list(List) ->
    [ ipld_to_structured(Elem) || Elem <- List ];
ipld_to_structured(Scalar)
        when is_binary(Scalar); is_integer(Scalar); is_float(Scalar) ->
    Scalar;
ipld_to_structured(null) -> null;
ipld_to_structured(true) -> true;
ipld_to_structured(false) -> false.

%%% Tests. Integration-level tests live in `dev_codec_ipfs_test'.

%% An `unsigned' commit with no `hash-alg' yields a single raw CIDv1
%% commitment carrying only the device, type, committed keys and signature.
commit_unsigned_raw_attaches_cid_test() ->
    Msg = #{ <<"body">> => <<"hello world">> },
    Req = #{ <<"type">> => <<"unsigned">> },
    {ok, Committed} = commit(Msg, Req, #{}),
    Comms = maps:get(<<"commitments">>, Committed),
    [CID] = maps:keys(Comms),
    ?assertEqual(
        <<"bafkreifzjut3te2nhyekklss27nh3k72ysco7y32koao5eei66wof36n5e">>,
        CID
    ),
    Commitment = maps:get(CID, Comms),
    ?assertEqual(?DEVICE_NAME, maps:get(<<"commitment-device">>, Commitment)),
    ?assertEqual(<<"sha2-256-raw">>, maps:get(<<"type">>, Commitment)),
    ?assertEqual([<<"body">>], maps:get(<<"committed">>, Commitment)),
    %% `signature' (the raw digest) is what lets the commitment survive the
    %% httpsig wire. `keyid', `committer', `hash-alg', and `multicodec' must
    %% be absent: no key is involved, and the multihash + multicodec are
    %% already encoded in `type'.
    ?assertMatch(#{<<"signature">> := _}, Commitment),
    ?assertNot(maps:is_key(<<"keyid">>, Commitment)),
    ?assertNot(maps:is_key(<<"committer">>, Commitment)),
    ?assertNot(maps:is_key(<<"hash-alg">>, Commitment)),
    ?assertNot(maps:is_key(<<"multicodec">>, Commitment)).

%% `hash-alg: sha2-256-dag-cbor' on an `unsigned' request produces the
%% dag-cbor CID of the body (here the one-byte empty-map encoding).
commit_unsigned_dag_cbor_test() ->
    Req = #{
        <<"type">> => <<"unsigned">>,
        <<"hash-alg">> => <<"sha2-256-dag-cbor">>
    },
    {ok, Committed} = commit(#{ <<"body">> => <<16#a0>> }, Req, #{}),
    [CID] = maps:keys(maps:get(<<"commitments">>, Committed)),
    ?assertEqual(
        <<"bafyreigbtj4x7ip5legnfznufuopl4sg4knzc2cof6duas4b3q2fy6swua">>,
        CID
    ).

%% Requesting the native type directly gives the same CID as `unsigned'.
commit_native_type_test() ->
    Req = #{ <<"type">> => <<"sha2-256-raw">> },
    {ok, Committed} = commit(#{ <<"body">> => <<"hello world">> }, Req, #{}),
    [CID] = maps:keys(maps:get(<<"commitments">>, Committed)),
    ?assertEqual(
        <<"bafkreifzjut3te2nhyekklss27nh3k72ysco7y32koao5eei66wof36n5e">>,
        CID
    ).

%% A pre-existing commitment is kept alongside the newly attached one.
commit_preserves_existing_commitments_test() ->
    Prior = #{ <<"other">> => #{ <<"kind">> => <<"x">> } },
    Msg = #{ <<"body">> => <<"hello world">>, <<"commitments">> => Prior },
    {ok, Committed} = commit(Msg, #{ <<"type">> => <<"unsigned">> }, #{}),
    ?assertEqual(2, maps:size(maps:get(<<"commitments">>, Committed))).

%% `type: signed' is not native, so the commitment comes from `httpsig@1.0'.
commit_signed_delegates_to_httpsig_test() ->
    Opts = #{ priv_wallet => ar_wallet:new() },
    {ok, Signed} =
        commit(#{ <<"body">> => <<"x">> }, #{ <<"type">> => <<"signed">> }, Opts),
    [{_CID, C} | _] = maps:to_list(maps:get(<<"commitments">>, Signed)),
    ?assertEqual(<<"httpsig@1.0">>, maps:get(<<"commitment-device">>, C)).

%% A `sha2-256-*' type that is not native is rejected, not delegated.
commit_rejects_unsupported_ipfs_type_test() ->
    Req = #{
        <<"type">> => <<"unsigned">>,
        <<"hash-alg">> => <<"sha2-256-dag-pb">>
    },
    ?assertMatch(
        {error, {unsupported_type, <<"sha2-256-dag-pb">>}},
        commit(#{ <<"body">> => <<"x">> }, Req, #{})
    ).

%% Verification succeeds when `body' is exactly what was committed.
verify_ok_for_intact_body_test() ->
    Msg = #{ <<"body">> => <<"hello world">> },
    {ok, Committed} = commit(Msg, #{ <<"type">> => <<"unsigned">> }, #{}),
    [{_CID, C}] = maps:to_list(maps:get(<<"commitments">>, Committed)),
    ?assertEqual({ok, true}, verify(Committed, C, #{})).

%% Changing `body' after commit changes the recomputed CID, so verify fails.
verify_fails_for_tampered_body_test() ->
    Msg = #{ <<"body">> => <<"hello world">> },
    {ok, Committed} = commit(Msg, #{ <<"type">> => <<"unsigned">> }, #{}),
    [{_CID, C}] = maps:to_list(maps:get(<<"commitments">>, Committed)),
    Tampered = Committed#{ <<"body">> => <<"hello earth">> },
    ?assertEqual({ok, false}, verify(Tampered, C, #{})).

%% Verifying under a different native type recomputes a different CID.
verify_fails_when_hash_alg_mismatches_test() ->
    Msg = #{ <<"body">> => <<"hello world">> },
    {ok, Committed} = commit(Msg, #{ <<"type">> => <<"unsigned">> }, #{}),
    [{_CID, C}] = maps:to_list(maps:get(<<"commitments">>, Committed)),
    Mismatched = C#{ <<"type">> => <<"sha2-256-dag-cbor">> },
    ?assertEqual({ok, false}, verify(Committed, Mismatched, #{})).
%%% File: src/dev_codec_ipfs_cbor.erl
%%% @doc Pure-Erlang deterministic DAG-CBOR encoder and decoder. Implements
%%% the dag-cbor subset of RFC 8949: definite-length containers only; 64-bit
%%% floats only, no NaN/Infinity; shortest-form integers in int64 range;
%%% text-string keys sorted length-first then bytewise; only tag 42 (IPLD
%%% link); valid UTF-8 text; simple values 20/21/22 only. Spec:
%%% https://ipld.io/specs/codecs/dag-cbor/spec/
%%%
%%% IPLD data model <-> Erlang intermediate form used here:
%%%
%%%   null | false | true   <-> atoms
%%%   integer | float       <-> Erlang number
%%%   text string           <-> UTF-8 binary
%%%   byte string           <-> `{bytes, Binary}'
%%%   array                 <-> list
%%%   map                   <-> map with binary keys
%%%   link (CID)            <-> `{link, CIDString}'
%%%
%%% The module does not touch `~structured@1.0' or TABM; `dev_codec_ipfs'
%%% bridges this IPLD form into HyperBEAM messages.
-module(dev_codec_ipfs_cbor).
-export([encode/1, decode/1]).
-include("include/hb.hrl").
-include_lib("eunit/include/eunit.hrl").

%% Integer range bounds per dag-cbor (int64).
-define(INT64_MAX, 16#7fffffffffffffff).
-define(INT64_MIN, -16#8000000000000000).

%% @doc Encode an IPLD value to dag-cbor bytes. Throws
%% `{dag_cbor_encode, Reason}' on invalid input; unexpected runtime errors
%% inside the encoder are rethrown as
%% `{dag_cbor_encode, {internal, Reason, Stack}}'.
encode(V) ->
    try iolist_to_binary(enc(V))
    catch throw:{dag_cbor_encode, _} = E -> throw(E);
          error:Reason:Stack ->
              throw({dag_cbor_encode, {internal, Reason, Stack}})
    end.

%% @doc Encode one IPLD value as an iolist of dag-cbor bytes.
enc(null) -> <<16#f6>>;
enc(true) -> <<16#f5>>;
enc(false) -> <<16#f4>>;
enc(N) when is_integer(N), N >= 0, N =< ?INT64_MAX ->
    enc_header(0, N);
enc(N) when is_integer(N), N < 0, N >= ?INT64_MIN ->
    %% CBOR major type 1 stores the value `-1 - N'.
    enc_header(1, -1 - N);
enc(N) when is_integer(N) ->
    throw({dag_cbor_encode, {integer_out_of_range, N}});
enc(F) when is_float(F) ->
    %% Every Erlang float is a finite IEEE-754 double (the VM has no NaN or
    %% Infinity values), so it is always legal dag-cbor: 0xfb followed by the
    %% 8 big-endian bytes. No finiteness test is needed, and a test such as
    %% `F == F + 1.0' must not be used: it holds for every finite double
    %% with magnitude >= 2^53 and would reject valid values like 1.0e300.
    <<16#fb, F:64/float>>;
enc(B) when is_binary(B) ->
    case is_valid_utf8(B) of
        true -> [enc_header(3, byte_size(B)), B];
        false -> throw({dag_cbor_encode, {invalid_utf8, B}})
    end;
enc({bytes, B}) when is_binary(B) ->
    [enc_header(2, byte_size(B)), B];
enc({link, CID}) when is_binary(CID) ->
    %% Tag 42 wraps a byte string: a 0x00 prefix byte followed by the bytes
    %% returned by `dev_codec_ipfs_cid:multibase_decode/1'.
    case dev_codec_ipfs_cid:multibase_decode(CID) of
        {ok, Inner} ->
            Wrapped = <<0, Inner/binary>>,
            [<<16#d8, 16#2a>>, enc({bytes, Wrapped})];
        {error, Reason} ->
            throw({dag_cbor_encode, {bad_cid_link, CID, Reason}})
    end;
enc(L) when is_list(L) ->
    [enc_header(4, length(L)), [ enc(V) || V <- L ]];
enc(M) when is_map(M) ->
    Pairs = maps:to_list(M),
    case lists:all(fun({K, _}) -> is_binary(K) end, Pairs) of
        false -> throw({dag_cbor_encode, non_string_map_key});
        true ->
            Sorted = lists:sort(fun pair_key_lt/2, Pairs),
            [enc_header(5, length(Sorted)),
             [ [enc(K), enc(V)] || {K, V} <- Sorted ]]
    end;
enc(Other) ->
    throw({dag_cbor_encode, {unsupported_type, Other}}).

%% @doc Dag-CBOR length-first, then bytewise key ordering. Shaped as a
%% `lists:sort/2' ordering function, so it returns `true' for equal keys.
pair_key_lt({K1, _}, {K2, _}) -> key_lt(K1, K2).
key_lt(A, B) ->
    case {byte_size(A), byte_size(B)} of
        {LA, LB} when LA < LB -> true;
        {LA, LB} when LA > LB -> false;
        _ -> A =< B
    end.

%% @doc Major type header with shortest-form length/argument. Arguments
%% below 24 are stored inline in the low 5 bits; larger arguments use
%% additional-info 24/25/26/27 followed by a 1/2/4/8-byte big-endian value.
enc_header(MT, N) when N < 24 ->
    <<MT:3, N:5>>;
enc_header(MT, N) when N < 16#100 ->
    <<MT:3, 24:5, N:8>>;
enc_header(MT, N) when N < 16#10000 ->
    <<MT:3, 25:5, N:16>>;
enc_header(MT, N) when N < 16#1_00000000 ->
    <<MT:3, 26:5, N:32>>;
enc_header(MT, N) when N < 16#1_0000000000000000 ->
    <<MT:3, 27:5, N:64>>.

%% @doc True when `B' is well-formed UTF-8, i.e. it converts to itself.
is_valid_utf8(B) ->
    unicode:characters_to_binary(B, utf8, utf8) =:= B.

%% @doc Decode a dag-cbor binary into an IPLD value. Strictly validates:
%% rejects indefinite-length items, non-64-bit floats, NaN/Infinity,
%% non-canonical integers, unsupported tags, non-UTF-8 strings, and
%% non-canonical map ordering. Returns `{ok, Value}' or `{error, Reason}';
%% bytes left over after the first item yield `{error, {trailing_bytes, _}}'.
decode(Bin) when is_binary(Bin) ->
    try dec_one(Bin) of
        {Value, <<>>} -> {ok, Value};
        {_Value, Leftover} -> {error, {trailing_bytes, Leftover}}
    catch
        throw:{dag_cbor_decode, Reason} -> {error, Reason};
        error:E -> {error, {malformed, E}}
    end.

%% @doc Decode exactly one item from the front of the binary, returning
%% `{Value, Rest}'.
dec_one(<<>>) ->
    throw({dag_cbor_decode, unexpected_end});
dec_one(<<7:3, AI:5, Rest/binary>>) ->
    %% Major type 7 is special: the additional info selects the value kind
    %% (simple value 20/21/22, half/single/double float). Its "argument" is
    %% not a length and is not subject to the canonical-integer gate.
    dec_simple_or_float(AI, Rest);
dec_one(<<MT:3, AI:5, Rest/binary>>) ->
    {Arg, AfterArg} = read_arg(AI, Rest),
    dec_value(MT, Arg, AfterArg).

%% @doc Read the argument for an informational length/value AI. Used by all
%% major types except 7 (simple/float). AI below 24 is the argument itself;
%% 24/25/26/27 announce a 1/2/4/8-byte argument that must be in shortest
%% form; 28-30 are reserved and 31 (indefinite length) is forbidden.
read_arg(AI, Rest) when AI < 24 ->
    {AI, Rest};
read_arg(24, <<N:8, Rest/binary>>) ->
    reject_non_canonical_int(24, N),
    {N, Rest};
read_arg(25, <<N:16, Rest/binary>>) ->
    reject_non_canonical_int(25, N),
    {N, Rest};
read_arg(26, <<N:32, Rest/binary>>) ->
    reject_non_canonical_int(26, N),
    {N, Rest};
read_arg(27, <<N:64, Rest/binary>>) ->
    reject_non_canonical_int(27, N),
    {N, Rest};
read_arg(AI, _) when AI >= 28, AI =< 30 ->
    throw({dag_cbor_decode, reserved_additional_info});
read_arg(31, _) ->
    throw({dag_cbor_decode, indefinite_length_forbidden});
read_arg(_, _) ->
    %% AI 24..27 without enough argument bytes.
    throw({dag_cbor_decode, unexpected_end}).

%% @doc Reject non-canonical integer encodings. For length arg AI 24 the
%% value N must be >= 24; for 25, >= 256; for 26, >= 65536; for 27,
%% >= 4294967296. Otherwise the encoder chose a wastefully long form.
reject_non_canonical_int(AI, N)
        when (AI =:= 24 andalso N < 24);
             (AI =:= 25 andalso N < 16#100);
             (AI =:= 26 andalso N < 16#10000);
             (AI =:= 27 andalso N < 16#1_00000000) ->
    throw({dag_cbor_decode, non_canonical_integer});
reject_non_canonical_int(_, _) ->
    ok.

%% @doc Decode the payload of major types 0-6 given the already-read
%% argument.
dec_value(0, N, Rest) ->
    {N, Rest};
dec_value(1, N, Rest) ->
    {-1 - N, Rest};
dec_value(2, Len, Rest) ->
    case Rest of
        <<Bytes:Len/binary, Tail/binary>> -> {{bytes, Bytes}, Tail};
        _ -> throw({dag_cbor_decode, {truncated_bytes, Len}})
    end;
dec_value(3, Len, Rest) ->
    case Rest of
        <<Text:Len/binary, Tail/binary>> ->
            case is_valid_utf8(Text) of
                true -> {Text, Tail};
                false -> throw({dag_cbor_decode, invalid_utf8})
            end;
        _ ->
            throw({dag_cbor_decode, {truncated_text, Len}})
    end;
dec_value(4, Count, Rest) ->
    dec_n(Count, Rest, []);
dec_value(5, Count, Rest) ->
    {Pairs, Tail} = dec_pairs(Count, Rest, [], <<>>),
    {maps:from_list(Pairs), Tail};
dec_value(6, 42, Rest) ->
    dec_link(Rest);
dec_value(6, Tag, _Rest) ->
    throw({dag_cbor_decode, {unsupported_tag, Tag}}).

%% @doc Simple values and floats live in major type 7. AI selects the subtype.
dec_simple_or_float(20, Rest) -> {false, Rest};
dec_simple_or_float(21, Rest) -> {true, Rest};
dec_simple_or_float(22, Rest) -> {null, Rest};
dec_simple_or_float(25, _Rest) ->
    throw({dag_cbor_decode, half_float_forbidden});
dec_simple_or_float(26, _Rest) ->
    throw({dag_cbor_decode, single_float_forbidden});
dec_simple_or_float(27, <<Bytes:8/binary, Rest/binary>>) ->
    %% A double-precision float follows. Erlang's `:64/float' binary match
    %% refuses NaN/Infinity, so a failed match here is reported as a clean
    %% `{error, nan_or_infinity_forbidden}'.
    case Bytes of
        <<F:64/float>> -> {F, Rest};
        _ -> throw({dag_cbor_decode, nan_or_infinity_forbidden})
    end;
dec_simple_or_float(27, _) ->
    throw({dag_cbor_decode, {truncated_double, 27}});
dec_simple_or_float(AI, _) ->
    throw({dag_cbor_decode, {unsupported_simple_value, AI}}).

%% @doc Decode `N' consecutive items into a list, preserving order.
dec_n(0, Rest, Acc) ->
    {lists:reverse(Acc), Rest};
dec_n(N, Rest, Acc) ->
    {Item, Tail} = dec_one(Rest),
    dec_n(N - 1, Tail, [Item | Acc]).

%% @doc Decode map pairs; verify keys are text strings in strictly ascending
%% dag-cbor order (length-first, then bytewise) with no duplicates.
dec_pairs(0, Rest, Acc, _Prev) ->
    {lists:reverse(Acc), Rest};
dec_pairs(N, Rest, Acc, Prev) ->
    {Key, AfterKey} = dec_one(Rest),
    case is_binary(Key) of
        true -> ok;
        false -> throw({dag_cbor_decode, non_string_map_key})
    end,
    %% The first key has no predecessor; every later key must sort strictly
    %% after the previous one (`key_lt/2' alone also accepts equal keys).
    InOrder =
        Acc =:= [] orelse (key_lt(Prev, Key) andalso Prev =/= Key),
    case InOrder of
        true -> ok;
        false -> throw({dag_cbor_decode, non_canonical_map_order})
    end,
    {Value, AfterValue} = dec_one(AfterKey),
    dec_pairs(N - 1, AfterValue, [{Key, Value} | Acc], Key).

%% @doc Decode the content of tag 42: a byte string whose first byte is 0x00
%% and whose remainder is passed to `dev_codec_ipfs_cid:multibase_encode/1'.
dec_link(Rest) ->
    case dec_one(Rest) of
        {{bytes, <<0, CIDBytes/binary>>}, Tail} ->
            {{link, dev_codec_ipfs_cid:multibase_encode(CIDBytes)}, Tail};
        {{bytes, _}, _} ->
            throw({dag_cbor_decode, malformed_cid_link_prefix});
        _ ->
            throw({dag_cbor_decode, cid_link_expects_byte_string})
    end.

%%% Tests — unit known-answer vectors and compound roundtrips.

scalars_roundtrip_test() ->
    Scalars = [{null, <<16#f6>>}, {true, <<16#f5>>}, {false, <<16#f4>>}],
    lists:foreach(
        fun({Atom, Byte}) -> ?assertEqual(Byte, encode(Atom)) end,
        Scalars
    ),
    lists:foreach(
        fun({Atom, Byte}) -> ?assertEqual({ok, Atom}, decode(Byte)) end,
        Scalars
    ).

integer_encodings_test() ->
    %% Values per RFC 8949 Appendix A.
    Cases = [
        {0, <<16#00>>},
        {1, <<16#01>>},
        {10, <<16#0a>>},
        {23, <<16#17>>},
        {24, <<16#18, 16#18>>},
        {25, <<16#18, 16#19>>},
        {100, <<16#18, 16#64>>},
        {255, <<16#18, 16#ff>>},
        {256, <<16#19, 16#01, 16#00>>},
        {1000, <<16#19, 16#03, 16#e8>>},
        {65535, <<16#19, 16#ff, 16#ff>>},
        {65536, <<16#1a, 16#00, 16#01, 16#00, 16#00>>},
        {4294967295, <<16#1a, 16#ff, 16#ff, 16#ff, 16#ff>>},
        {4294967296, <<16#1b, 0, 0, 0, 1, 0, 0, 0, 0>>},
        {-1, <<16#20>>},
        {-10, <<16#29>>},
        {-24, <<16#37>>},
        {-25, <<16#38, 16#18>>},
        {-100, <<16#38, 16#63>>},
        {-1000, <<16#39, 16#03, 16#e7>>}
    ],
    lists:foreach(
        fun({Int, Bytes}) ->
            ?assertEqual(Bytes, encode(Int)),
            ?assertEqual({ok, Int}, decode(Bytes))
        end,
        Cases
    ).

integer_out_of_range_raises_test() ->
    ?assertThrow(
        {dag_cbor_encode, {integer_out_of_range, _}},
        encode(16#1_00000000_00000000)
    ),
    ?assertThrow(
        {dag_cbor_encode, {integer_out_of_range, _}},
        encode(-16#8000000000000001)
    ).

non_canonical_integer_rejected_test() ->
    %% 0 encoded in 8-bit additional-info form: 0x18 0x00. Must be rejected.
    ?assertEqual({error, non_canonical_integer}, decode(<<16#18, 16#00>>)),
    %% 24 in 16-bit form: 0x19 0x00 0x18
    ?assertEqual(
        {error, non_canonical_integer},
        decode(<<16#19, 16#00, 16#18>>)
    ).

float_roundtrip_test() ->
    %% A finite double encodes to 0xfb + 8 bytes big-endian IEEE 754.
    Encoded = encode(1.5),
    ?assertEqual(<<16#fb, 1.5:64/big-float>>, Encoded),
    ?assertEqual({ok, 1.5}, decode(Encoded)).

nan_rejected_on_decode_test() ->
    ?assertMatch(
        {error, _},
        decode(<<16#fb, 16#7f, 16#f8, 0, 0, 0, 0, 0, 0>>)
    ).

infinity_rejected_on_decode_test() ->
    PosInf = <<16#fb, 16#7f, 16#f0, 0, 0, 0, 0, 0, 0>>,
    NegInf = <<16#fb, 16#ff, 16#f0, 0, 0, 0, 0, 0, 0>>,
    ?assertMatch({error, _}, decode(PosInf)),
    ?assertMatch({error, _}, decode(NegInf)).

half_and_single_float_rejected_test() ->
    %% 0xf9 xx xx is a half-float; 0xfa xx xx xx xx is a single-float.
    ?assertEqual({error, half_float_forbidden}, decode(<<16#f9, 0, 0>>)),
    ?assertEqual(
        {error, single_float_forbidden},
        decode(<<16#fa, 0, 0, 0, 0>>)
    ).

indefinite_length_rejected_test() ->
    %% 0x9f is indefinite-length array; 0xbf is indefinite-length map.
    ?assertEqual(
        {error, indefinite_length_forbidden},
        decode(<<16#9f, 16#ff>>)
    ),
    ?assertEqual(
        {error, indefinite_length_forbidden},
        decode(<<16#bf, 16#ff>>)
    ).

text_string_encoding_test() ->
    ?assertEqual(<<16#65, "hello">>, encode(<<"hello">>)),
    ?assertEqual({ok, <<"hello">>}, decode(<<16#65, "hello">>)),
    %% Empty string.
    ?assertEqual(<<16#60>>, encode(<<>>)),
    ?assertEqual({ok, <<>>}, decode(<<16#60>>)).

text_string_invalid_utf8_rejected_test() ->
    %% 0x80 is a lone continuation byte.
    ?assertMatch({error, invalid_utf8}, decode(<<16#61, 16#80>>)),
    ?assertThrow({dag_cbor_encode, {invalid_utf8, _}}, encode(<<16#80>>)).

bytes_encoding_test() ->
    ?assertEqual(<<16#43, "hi!">>, encode({bytes, <<"hi!">>})),
    ?assertEqual({ok, {bytes, <<"hi!">>}}, decode(<<16#43, "hi!">>)).

array_encoding_test() ->
    %% [] -> 80
    ?assertEqual(<<16#80>>, encode([])),
    ?assertEqual({ok, []}, decode(<<16#80>>)),
    %% [1, 2, 3] -> 83 01 02 03
    ?assertEqual(<<16#83, 16#01, 16#02, 16#03>>, encode([1, 2, 3])),
    ?assertEqual({ok, [1, 2, 3]}, decode(<<16#83, 16#01, 16#02, 16#03>>)).

map_encoding_canonical_test() ->
    %% {"a": 1} -> a1 61 61 01
    ?assertEqual(<<16#a1, 16#61, "a", 16#01>>, encode(#{ <<"a">> => 1 })),
    ?assertEqual(
        {ok, #{ <<"a">> => 1 }},
        decode(<<16#a1, 16#61, "a", 16#01>>)
    ),
    %% {} -> a0
    ?assertEqual(<<16#a0>>, encode(#{})),
    ?assertEqual({ok, #{}}, decode(<<16#a0>>)).

%% @doc Length-first ordering beats alphabetical: {"aa":1,"z":2} encodes z first.
map_length_first_ordering_test() ->
    Original = #{ <<"aa">> => 1, <<"z">> => 2 },
    Bytes = encode(Original),
    %% Expected: a2 | 61 7a 02 | 62 61 61 01
    ?assertEqual(<<16#a2, 16#61, "z", 16#02, 16#62, "aa", 16#01>>, Bytes),
    ?assertEqual({ok, Original}, decode(Bytes)).

map_non_canonical_order_rejected_test() ->
    %% Same contents but in non-canonical order: "aa" before "z".
    ?assertEqual(
        {error, non_canonical_map_order},
        decode(<<16#a2, 16#62, "aa", 16#01, 16#61, "z", 16#02>>)
    ).

map_duplicate_keys_rejected_test() ->
    %% Two entries with key "a". Length-first ordering requires strictly less.
    ?assertEqual(
        {error, non_canonical_map_order},
        decode(<<16#a2, 16#61, "a", 16#01, 16#61, "a", 16#02>>)
    ).

map_non_string_key_rejected_test() ->
    %% {1: true} — integer key. Not allowed in dag-cbor.
    ?assertEqual(
        {error, non_string_map_key},
        decode(<<16#a1, 16#01, 16#f5>>)
    ).

unsupported_tag_rejected_test() ->
    %% Tag 0 (date/time string) is common in CBOR but forbidden in dag-cbor.
    ?assertEqual({error, {unsupported_tag, 0}}, decode(<<16#c0, 16#61, "x">>)),
    ?assertEqual({error, {unsupported_tag, 1}}, decode(<<16#c1, 16#01>>)).

cid_link_roundtrip_test() ->
    Link =
        {link, <<"bafkreifzjut3te2nhyekklss27nh3k72ysco7y32koao5eei66wof36n5e">>},
    Bytes = encode(Link),
    %% Structure: d8 2a | byte-string header | 00 | CID bytes.
    %% We don't hard-code the whole thing — we just roundtrip.
    ?assertEqual({ok, Link}, decode(Bytes)),
    %% And the tag prefix is exactly d8 2a.
    ?assertMatch(<<16#d8, 16#2a, _/binary>>, Bytes).

cid_link_without_multibase_prefix_rejected_test() ->
    %% A tag-42 byte string that starts with 0x01 (not 0x00) is malformed.
    ?assertEqual(
        {error, malformed_cid_link_prefix},
        decode(<<16#d8, 16#2a, 16#42, 16#01, 16#02>>)
    ).

%%% Compound roundtrips: HyperBEAM-message-like IPLD data.
+
+compound_roundtrip_test() ->
+    Value = #{
+        <<"name">> => <<"alice">>,
+        <<"age">> => 30,
+        <<"admin">> => true,
+        <<"rating">> => 4.5,
+        <<"tags">> => [<<"a">>, <<"b">>, <<"c">>],
+        <<"parent">> => null,
+        <<"blob">> => {bytes, <<0, 1, 2, 3>>},
+        <<"nested">> => #{
+            <<"k">> => <<"v">>,
+            <<"n">> => -42
+        }
+    },
+    Encoded = encode(Value),
+    ?assertEqual({ok, Value}, decode(Encoded)),
+    %% Determinism: encoding twice must produce the exact same bytes.
+    ?assertEqual(Encoded, encode(Value)).
+
+determinism_across_insertion_order_test() ->
+    %% Same logical map, two different insertion orders in the source code,
+    %% must serialize to identical bytes.
+    Ordered1 = #{ <<"a">> => 1, <<"bb">> => 2, <<"ccc">> => 3 },
+    Ordered2 = #{ <<"ccc">> => 3, <<"a">> => 1, <<"bb">> => 2 },
+    ?assertEqual(encode(Ordered1), encode(Ordered2)).
+
+trailing_bytes_rejected_test() ->
+    ?assertMatch({error, {trailing_bytes, _}},
+        decode(<<16#00, 16#00>>)).
+
+shortest_form_integers_encoded_test() ->
+    %% 23 must use single byte (major 0, info 23) — 0x17, not 0x18 0x17.
+    ?assertEqual(<<16#17>>, encode(23)).
+
+%% @doc End-to-end validation: an encoded empty dag-cbor map, CID-hashed,
+%% must match the well-known empty-map dag-cbor CID. This closes the loop
+%% with the phase-1 CID machinery.
+empty_map_cid_matches_canonical_test() ->
+    Encoded = encode(#{}),
+    ?assertEqual(<<16#a0>>, Encoded),
+    CID = dev_codec_ipfs_cid:encode(<<"dag-cbor">>, sha2_256, Encoded),
+    ?assertEqual(
+        <<"bafyreigbtj4x7ip5legnfznufuopl4sg4knzc2cof6duas4b3q2fy6swua">>,
+        CID
+    ).
+
+%%% Additional dag-cbor-spec vectors. Each `{Value, Bytes}' pair is an IPLD
+%%% value and its canonical deterministic encoding per the DAG-CBOR spec.
+%%% These cover the data-model paths not hit by the scalar/int tests above.
+
+spec_vectors_test() ->
+    Cases = [
+        %% Mixed nulls and bools array (5 elements).
+        {[null, true, false, null, true],
+            <<16#85, 16#f6, 16#f5, 16#f4, 16#f6, 16#f5>>},
+        %% Empty text string.
+        {<<>>, <<16#60>>},
+        %% Empty byte string.
+        {{bytes, <<>>}, <<16#40>>},
+        %% String with length 23 (1-byte header: 0x77).
+        {<<"abcdefghijklmnopqrstuvw">>,
+            <<16#77, "abcdefghijklmnopqrstuvw">>},
+        %% String with length 24 (2-byte header: 0x78 0x18).
+        {<<"abcdefghijklmnopqrstuvwx">>,
+            <<16#78, 16#18, "abcdefghijklmnopqrstuvwx">>},
+        %% Nested list: [[1,2],[3]].
+        {[[1, 2], [3]],
+            <<16#82, 16#82, 16#01, 16#02, 16#81, 16#03>>},
+        %% Map containing a list value.
+        {#{ <<"xs">> => [1, 2, 3] },
+            <<16#a1, 16#62, "xs", 16#83, 16#01, 16#02, 16#03>>},
+        %% Deeply nested map: {"a":{"b":{"c":1}}}.
+        {#{ <<"a">> => #{ <<"b">> => #{ <<"c">> => 1 } } },
+            <<16#a1, 16#61, "a", 16#a1, 16#61, "b", 16#a1, 16#61, "c", 16#01>>}
+    ],
+    lists:foreach(
+        fun({Value, Expected}) ->
+            ?assertEqual(Expected, encode(Value)),
+            ?assertEqual({ok, Value}, decode(Expected))
+        end,
+        Cases
+    ).
+
+%% @doc Stress: a map with many keys at assorted lengths forces the
+%% canonical length-first ordering to kick in, and confirms the encoded
+%% output is stable even when the source map enumerates keys in a
+%% different order.
+stress_map_ordering_test() ->
+    Keys = [<<"a">>, <<"b">>, <<"c">>, <<"aa">>, <<"ab">>, <<"abc">>,
+            <<"abcd">>, <<"z">>, <<"zz">>],
+    Pairs = lists:zip(Keys, lists:seq(1, length(Keys))),
+    M1 = maps:from_list(Pairs),
+    M2 = maps:from_list(lists:reverse(Pairs)),
+    Bytes1 = encode(M1),
+    Bytes2 = encode(M2),
+    ?assertEqual(Bytes1, Bytes2),
+    %% Decode must produce the same map.
+    ?assertEqual({ok, M1}, decode(Bytes1)).
+
+%% @doc 64-bit integer boundaries. Critical for int64 correctness. Every
+%% value must survive an encode/decode roundtrip unchanged.
+int_boundary_test() ->
+    Cases = [
+        %% Max 8-bit (255) and 8-bit + 1 (256) already covered.
+        %% Max 16-bit (65535) and 16-bit + 1 (65536) already covered.
+        %% Max 32-bit: the largest value that still fits the 4-byte form.
+        4294967295,
+        %% Max 32-bit + 1 (exercises 64-bit encoder).
+        4294967296,
+        %% Max positive int64.
+        16#7fffffffffffffff,
+        %% Min int64 (the most negative value).
+        -16#8000000000000000,
+        %% A mid-range negative.
+        -1234567890
+    ],
+    lists:foreach(
+        fun(N) ->
+            Encoded = encode(N),
+            ?assertEqual({ok, N}, decode(Encoded))
+        end,
+        Cases
+    ).
+
+%% @doc A more structurally interesting map: the simplest non-trivial
+%% dag-cbor object, `{"hello":"world"}'. The block bytes are asserted
+%% exactly. The CID is checked structurally — `bafyrei' prefix, length 59,
+%% and a digest equal to the sha2-256 of the block — rather than against a
+%% hard-coded value.
+simple_map_bytes_and_cid_test() ->
+    Encoded = encode(#{ <<"hello">> => <<"world">> }),
+    %% a1 65 68 65 6c 6c 6f 65 77 6f 72 6c 64
+    ?assertEqual(
+        <<16#a1, 16#65, "hello", 16#65, "world">>,
+        Encoded
+    ),
+    CID = dev_codec_ipfs_cid:encode(<<"dag-cbor">>, sha2_256, Encoded),
+    %% Deterministic, CIDv1 / dag-cbor / sha2-256 / base32-lower prefix `b'.
+    %% Length 59, starts with `bafyrei' — the dag-cbor + sha2-256 signature.
+    ?assertMatch(<<"bafyrei", _:52/binary>>, CID),
+    ?assertEqual(59, byte_size(CID)),
+    %% Decoding the CID back out recovers the same sha2-256 digest as the
+    %% block bytes we just produced.
+    {ok, Parts} = dev_codec_ipfs_cid:decode(CID),
+    ?assertEqual(<<"sha2-256-dag-cbor">>, maps:get(<<"hash-alg">>, Parts)),
+    ?assertEqual(crypto:hash(sha256, Encoded), maps:get(<<"digest">>, Parts)).
diff --git a/src/dev_codec_ipfs_cid.erl b/src/dev_codec_ipfs_cid.erl
new file mode 100644
index 000000000..df7bffae6
--- /dev/null
+++ b/src/dev_codec_ipfs_cid.erl
@@ -0,0 +1,194 @@
+%%% @doc Pure functions for the thin slice of the IPFS/IPLD spec that
+%%% `~ipfs@1.0' needs: unsigned varints, sha2-256 multihashes, base32-lower
+%%% multibase, and CIDv1 encode/decode. Not a general IPFS library — CIDv0,
+%%% non-sha2 hashes, multibases other than `b', and IPLD path resolution
+%%% are all out of scope.
+%%%
+%%% References:
+%%% - CIDv1: https://github.com/multiformats/cid
+%%% - Multihash: https://github.com/multiformats/multihash
+%%% - Multibase: https://github.com/multiformats/multibase
+%%% - unsigned-varint: https://github.com/multiformats/unsigned-varint
+-module(dev_codec_ipfs_cid).
+-export([encode/3, decode/1]).
+-export([codec_code/1, codec_name/1]).
+-export([multihash/2, multibase_encode/1, multibase_decode/1]).
+-export([varint_encode/1, varint_decode/1]).
+-include("include/hb.hrl").
+-include_lib("eunit/include/eunit.hrl").
+
+%% Multicodec codes. Full registry:
+%% https://github.com/multiformats/multicodec/blob/master/table.csv
+-define(CODEC_RAW, 16#55).
+-define(CODEC_DAG_CBOR, 16#71).
+
+%% Multihash function code and sha2-256 digest length.
+-define(HASH_SHA2_256, 16#12).
+-define(SHA2_256_LEN, 32).
+
+%% Multibase prefix for base32 lowercase (RFC4648, no padding).
+-define(MB_BASE32_LOWER, $b).
+
+%% @doc Build the CIDv1 string for `Body'. `Codec' is either a codec name
+%% (`<<"raw">>' or `<<"dag-cbor">>') or an already-resolved multicodec
+%% integer; the hash algorithm is `sha2_256' (atom) or `<<"sha2-256">>'
+%% (binary). The result is the multibase encoding of
+%% `varint(1) ++ varint(codec) ++ multihash(Body)'.
+encode(Codec, HashAlg, Body) when is_binary(Codec) ->
+    encode(codec_code(Codec), HashAlg, Body);
+encode(CodecCode, <<"sha2-256">>, Body) ->
+    encode(CodecCode, sha2_256, Body);
+encode(CodecCode, sha2_256, Body)
+        when is_integer(CodecCode), is_binary(Body) ->
+    Version = varint_encode(1),
+    CodecPrefix = varint_encode(CodecCode),
+    Multihash = multihash(sha2_256, Body),
+    multibase_encode(
+        <<Version/binary, CodecPrefix/binary, Multihash/binary>>).
+
+%% @doc Split a CIDv1 string into its parts. Returns `{ok, Parts}', or the
+%% `{error, _}' produced by whichever stage (multibase or CID bytes) failed.
+decode(Bin) when is_binary(Bin) ->
+    case multibase_decode(Bin) of
+        {ok, CidBytes} -> decode_bytes(CidBytes);
+        Failure -> Failure
+    end.
+
+%% @doc Parse the raw bytes of a CIDv1: version varint (must be 1),
+%% multicodec varint, then a multihash (hash-code varint, digest-length
+%% varint, digest). Only sha2-256 with a 32-byte digest is accepted. Any
+%% parse failure — including a non-1 version — becomes
+%% `{error, malformed_cid}'.
+decode_bytes(Bin) ->
+    try
+        {1, Rest1} = varint_decode(Bin),
+        {CodecCode, Rest2} = varint_decode(Rest1),
+        {HashCode, Rest3} = varint_decode(Rest2),
+        {DigestLen, Rest4} = varint_decode(Rest3),
+        case {HashCode, DigestLen, Rest4} of
+            %% The digest must be exactly the remaining 32 bytes.
+            {?HASH_SHA2_256, ?SHA2_256_LEN,
+                    <<Digest:?SHA2_256_LEN/binary>>} ->
+                Multicodec = codec_name(CodecCode),
+                {ok, #{
+                    <<"version">> => 1,
+                    <<"hash-alg">> => <<"sha2-256-", Multicodec/binary>>,
+                    <<"digest">> => Digest
+                }};
+            {?HASH_SHA2_256, ?SHA2_256_LEN, _} ->
+                {error, truncated_digest};
+            {Other, _, _} ->
+                {error, {unsupported_hash, Other}}
+        end
+    catch _:_ -> {error, malformed_cid}
+    end.
+
+%% @doc Resolve a codec name to its multicodec code. Throws
+%% `{unsupported_codec, Name}' for any other name.
+codec_code(<<"raw">>) -> ?CODEC_RAW;
+codec_code(<<"dag-cbor">>) -> ?CODEC_DAG_CBOR;
+codec_code(Other) -> throw({unsupported_codec, Other}).
+
+%% @doc Inverse of `codec_code/1'. Unknown codes round-trip as
+%% `<<"codec-0xHEX">>' so `decode/1' never throws on a stranger's CID.
+codec_name(?CODEC_RAW) -> <<"raw">>;
+codec_name(?CODEC_DAG_CBOR) -> <<"dag-cbor">>;
+codec_name(N) when is_integer(N) ->
+    iolist_to_binary(io_lib:format("codec-0x~.16b", [N])).
+
+%% @doc Wrap a sha2-256 digest of `Body' as a multihash binary:
+%% hash-code varint, digest-length varint, then the 32-byte digest.
+multihash(sha2_256, Body) when is_binary(Body) ->
+    <<(varint_encode(?HASH_SHA2_256))/binary,
+      (varint_encode(?SHA2_256_LEN))/binary,
+      (crypto:hash(sha256, Body))/binary>>.
+
+%% @doc Multibase-encode a binary as base32-lowercase, no padding, prefix `b'.
+%% NOTE(review): body reconstructed from the decoder, which upper-cases and
+%% re-pads before calling `base32:decode/1' — presumably `base32:encode/1'
+%% emits upper-case, `='-padded output. Confirm against the library.
+multibase_encode(Bin) when is_binary(Bin) ->
+    <<?MB_BASE32_LOWER,
+      (string:trim(string:lowercase(base32:encode(Bin)), trailing, "="))/binary>>.
+
+%% @doc Multibase-decode. Accepts base32-lower (`b'), base32-upper (`B'),
+%% and base16-lower (`f') defensively; anything else is `{error, _}'.
+multibase_decode(<<?MB_BASE32_LOWER, Rest/binary>>) ->
+    safe(fun() -> base32:decode(pad_base32(string:uppercase(Rest))) end,
+         invalid_base32);
+multibase_decode(<<$B, Rest/binary>>) ->
+    safe(fun() -> base32:decode(pad_base32(Rest)) end, invalid_base32);
+multibase_decode(<<$f, Rest/binary>>) ->
+    safe(fun() -> binary:decode_hex(Rest) end, invalid_base16);
+multibase_decode(<<Other, _/binary>>) ->
+    %% Any other leading byte is a multibase this module does not speak.
+    {error, {unsupported_multibase, <<Other>>}};
+multibase_decode(_) ->
+    {error, empty_cid}.
+
+%% Run `Fun', converting any exception into `{error, ErrorTag}'.
+safe(Fun, ErrorTag) ->
+    try {ok, Fun()} catch _:_ -> {error, ErrorTag} end.
+
+pad_base32(Bin) ->
+    %% RFC4648 base32 groups are 40 bits (8 chars). Pad with `=' to a
+    %% multiple of 8.
+    case (8 - (byte_size(Bin) rem 8)) rem 8 of
+        0 -> Bin;
+        N -> <<Bin/binary, (binary:copy(<<"=">>, N))/binary>>
+    end.
+
+%% @doc Encode a non-negative integer as an unsigned varint: 7 bits per
+%% byte, least-significant group first, high bit set on every byte except
+%% the last.
+%% NOTE(review): this definition and `varint_decode/1' were reconstructed
+%% from the export list and the tests; confirm against the original.
+varint_encode(N) when is_integer(N), N >= 0, N < 16#80 ->
+    <<N>>;
+varint_encode(N) when is_integer(N), N >= 16#80 ->
+    <<1:1, (N band 16#7f):7, (varint_encode(N bsr 7))/binary>>.
+
+%% @doc Decode one unsigned varint from the front of `Bin', returning
+%% `{Value, Rest}'. Throws `{malformed_varint, Remaining}' if the input
+%% ends before a terminating byte (high bit clear) is seen.
+varint_decode(Bin) when is_binary(Bin) ->
+    varint_decode(Bin, 0, 0).
+
+varint_decode(<<0:1, Low:7, Rest/binary>>, Shift, Acc) ->
+    {Acc bor (Low bsl Shift), Rest};
+varint_decode(<<1:1, Low:7, Rest/binary>>, Shift, Acc) ->
+    varint_decode(Rest, Shift + 7, Acc bor (Low bsl Shift));
+varint_decode(Remaining, _Shift, _Acc) ->
+    throw({malformed_varint, Remaining}).
+
+%%% Tests
+
+hello_world_raw_cid_test() ->
+    ?assertEqual(
+        <<"bafkreifzjut3te2nhyekklss27nh3k72ysco7y32koao5eei66wof36n5e">>,
+        encode(<<"raw">>, sha2_256, <<"hello world">>)).
+
+empty_raw_cid_test() ->
+    ?assertEqual(
+        <<"bafkreihdwdcefgh4dqkjv67uzcmw7ojee6xedzdetojuzjevtenxquvyku">>,
+        encode(<<"raw">>, sha2_256, <<>>)).
+
+empty_dag_cbor_cid_test() ->
+    ?assertEqual(
+        <<"bafyreigbtj4x7ip5legnfznufuopl4sg4knzc2cof6duas4b3q2fy6swua">>,
+        encode(<<"dag-cbor">>, sha2_256, <<16#a0>>)).
+
+roundtrip_decode_raw_test() ->
+    CID = encode(<<"raw">>, sha2_256, <<"hello world">>),
+    {ok, Parts} = decode(CID),
+    ?assertEqual(<<"sha2-256-raw">>, maps:get(<<"hash-alg">>, Parts)),
+    ?assertEqual(1, maps:get(<<"version">>, Parts)),
+    ?assertEqual(crypto:hash(sha256, <<"hello world">>),
+        maps:get(<<"digest">>, Parts)).
+
+roundtrip_decode_dag_cbor_test() ->
+    {ok, Parts} = decode(encode(<<"dag-cbor">>, sha2_256, <<"body bytes">>)),
+    ?assertEqual(<<"sha2-256-dag-cbor">>, maps:get(<<"hash-alg">>, Parts)).
+
+bad_multibase_prefix_test() ->
+    ?assertMatch({error, {unsupported_multibase, _}}, decode(<<"Qmfoobar">>)).
+
+malformed_cid_test() ->
+    ?assertMatch({error, _}, decode(<<"baaa">>)).
+
+varint_roundtrip_test() ->
+    [ ?assertEqual({N, <<>>}, varint_decode(varint_encode(N)))
+      || N <- [0, 1, 127, 128, 255, 16#55, 16#71, 1234, 16#ffff, 16#ffffffff] ].
+
+varint_truncated_raises_test() ->
+    %% 0xff has its continuation bit set but nothing follows it.
+    ?assertThrow({malformed_varint, _}, varint_decode(<<16#ff>>)).
+
+multihash_shape_test() ->
+    MH = multihash(sha2_256, <<"x">>),
+    <<16#12, 32, Digest:32/binary>> = MH,
+    ?assertEqual(34, byte_size(MH)),
+    ?assertEqual(crypto:hash(sha256, <<"x">>), Digest).
+
+multibase_roundtrip_test() ->
+    Bytes = <<0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20>>,
+    Encoded = multibase_encode(Bytes),
+    %% The encoded form must carry the base32-lower multibase prefix.
+    ?assertMatch(<<?MB_BASE32_LOWER, _/binary>>, Encoded),
+    ?assertEqual({ok, Bytes}, multibase_decode(Encoded)).
diff --git a/src/dev_codec_ipfs_test_vectors.erl b/src/dev_codec_ipfs_test_vectors.erl
new file mode 100644
index 000000000..b33bd5cc2
--- /dev/null
+++ b/src/dev_codec_ipfs_test_vectors.erl
@@ -0,0 +1,622 @@
+%%% @doc Test vectors for `~ipfs@1.0'. Three layers of coverage:
+%%%
+%%% 1. Integration — dispatch through `hb_message:commit/3' and
+%%%    `hb_message:verify/3', cache linkage from CID to message, and the
+%%%    `to/3' / `from/3' dag-cbor conversions.
+%%% 2. Live — end-to-end tests against real IPFS HTTP gateways and live
+%%%    in-process HyperBEAM nodes (the flows advertised in PR #868:
+%%%    serve a CID, preload/pin a CID, commit as ANS-104, relay between
+%%%    two nodes). Tests skip gracefully when no gateway is reachable.
+%%% 3. Message vectors — the `hb_message_test_vectors' battery run
+%%%    against the codec, with a `skip' list declared on the opts entry
+%%%    for vectors that do not apply to a content-addressed, unsigned-
+%%%    only codec.
+%%%
+%%% Unit-level tests continue to live inline in `dev_codec_ipfs',
+%%% `dev_codec_ipfs_cid', `dev_codec_ipfs_cbor', and `hb_store_ipfs_gateway'.
+-module(dev_codec_ipfs_test_vectors).
+-include_lib("eunit/include/eunit.hrl").
+-include("include/hb.hrl").
+
+%% Canonical IPFS ground truth: `ipfs add --raw-leaves -Q <"hello world"'.
+-define(HELLO_WORLD, <<"hello world">>).
+-define(HELLO_WORLD_CID,
+    <<"bafkreifzjut3te2nhyekklss27nh3k72ysco7y32koao5eei66wof36n5e">>).
+%% The canonical empty dag-cbor block `{}` (byte `0xa0') is pinned on
+%% every public gateway.
+-define(EMPTY_MAP_CID,
+    <<"bafyreigbtj4x7ip5legnfznufuopl4sg4knzc2cof6duas4b3q2fy6swua">>).
+-define(LIVE_GATEWAYS, [
+    <<"https://ipfs.io">>,
+    <<"https://dweb.link">>,
+    <<"https://nftstorage.link">>,
+    <<"https://4everland.io">>
+]).
+-define(LOOKUP_PATH,
+    <<"/~lookup@1.0/read&target=", ?HELLO_WORLD_CID/binary>>).
+
+%%% Helpers
+
+%% @doc Opts for the integration tests. `opts/0' starts from a fresh test
+%% store; `opts/1' takes any base opts and prepends the `~ipfs@1.0' device
+%% to that base's `preloaded_devices'.
+opts() ->
+    Store = hb_test_utils:test_store(),
+    opts(#{ store => Store }).
+opts(Base) ->
+    Preloaded = [ipfs_device() | hb_opts:get(preloaded_devices, [], Base)],
+    Base#{ preloaded_devices => Preloaded }.
+
+%% @doc Attach an unsigned `~ipfs@1.0' commitment to `Msg'. Keys in `Extra'
+%% (for example `hash-alg') are passed along in the commitment request;
+%% `commitment-device' and `type' are always forced.
+ipfs_commit(Msg, Opts) ->
+    ipfs_commit(Msg, Opts, #{}).
+ipfs_commit(Msg, Opts, Extra) ->
+    Spec = Extra#{
+        <<"commitment-device">> => <<"ipfs@1.0">>,
+        <<"type">> => <<"unsigned">>
+    },
+    hb_message:commit(Msg, Opts, Spec).
+
+%% The `preloaded_devices' entry that maps `ipfs@1.0' to its module.
+ipfs_device() ->
+    #{ <<"name">> => <<"ipfs@1.0">>, <<"module">> => dev_codec_ipfs }.
+
+%% Store config for the gateway-backed store used by the live tests.
+gateway_store() ->
+    #{
+        <<"store-module">> => hb_store_ipfs_gateway,
+        <<"gateways">> => ?LIVE_GATEWAYS,
+        <<"timeout">> => 20000
+    }.
+
+%% @doc Opts for a full node: the `~ipfs@1.0' device preloaded, a wallet,
+%% and a store chain of a test store followed by the gateway store.
+node_opts() ->
+    Devices = [ipfs_device() | hb_opts:get(preloaded_devices, [], #{})],
+    StoreChain = [hb_test_utils:test_store(), gateway_store()],
+    #{
+        cache_control => <<"cache">>,
+        priv_wallet => hb:wallet(),
+        preloaded_devices => Devices,
+        store => StoreChain
+    }.
+
+%% @doc Run `Fun' if the canonical `hello world' CID is live-reachable;
+%% otherwise emit a skip note. Every live test routes through this.
+with_live_gateways(Fun) ->
+    %% The gateway store talks HTTP(S), so both applications must be up
+    %% before the probe read.
+    application:ensure_all_started(inets),
+    application:ensure_all_started(ssl),
+    Probe = hb_store_ipfs_gateway:read(gateway_store(), ?HELLO_WORLD_CID),
+    case Probe of
+        {ok, _} ->
+            Fun();
+        _ ->
+            ?debugFmt("Skipping: all gateways unreachable for ~s",
+                [?HELLO_WORLD_CID])
+    end.
+
+%% @doc Pull the body out of an `hb_http:get' response. A map response has
+%% its `body' passed through `hb_cache:ensure_loaded/2'; a bare binary is
+%% returned untouched.
+response_body(#{ <<"body">> := Body }) -> hb_cache:ensure_loaded(Body, #{});
+response_body(Raw) when is_binary(Raw) -> Raw.
+
+%%% 1. Integration — dispatch through hb_message:commit / verify
+
+hb_message_commit_dispatches_to_us_test() ->
+    Committed = ipfs_commit(#{ <<"body">> => ?HELLO_WORLD }, opts()),
+    Commitments = maps:get(<<"commitments">>, Committed),
+    %% The commitment is keyed by the CID of the body...
+    ?assert(maps:is_key(?HELLO_WORLD_CID, Commitments)),
+    %% ...and records this codec as its commitment device.
+    Commitment = maps:get(?HELLO_WORLD_CID, Commitments),
+    ?assertEqual(
+        <<"ipfs@1.0">>,
+        maps:get(<<"commitment-device">>, Commitment)
+    ).
+
+hb_message_verify_dispatches_to_us_test() ->
+    Opts = opts(),
+    Committed = ipfs_commit(#{ <<"body">> => ?HELLO_WORLD }, Opts),
+    VerifySpec = #{ <<"commitment-ids">> => [?HELLO_WORLD_CID] },
+    ?assert(hb_message:verify(Committed, VerifySpec, Opts)).
+
+verify_rejects_tampered_body_via_hb_message_test() ->
+    Opts = opts(),
+    Committed = ipfs_commit(#{ <<"body">> => ?HELLO_WORLD }, Opts),
+    %% Swap the body while leaving the original commitment in place.
+    Tampered = Committed#{ <<"body">> => <<"hello earth">> },
+    VerifySpec = #{ <<"commitment-ids">> => [?HELLO_WORLD_CID] },
+    ?assertNot(hb_message:verify(Tampered, VerifySpec, Opts)).
+
+committed_returns_body_key_test() ->
+    Opts = opts(),
+    Committed = ipfs_commit(#{ <<"body">> => ?HELLO_WORLD }, Opts),
+    CommittedKeys = hb_message:committed(Committed, [?HELLO_WORLD_CID], Opts),
+    ?assertEqual([<<"body">>], CommittedKeys).
+
+%%% 2. Cache linkage — the load-bearing claim of phase 1
+
+%% @doc Write a committed message to the cache, look it up by CID alone.
+%% `hb_cache:do_write_message/3' links commitment IDs to the uncommitted
+%% root; `hb_cache:read/2' follows that link.
+cache_links_cid_to_uncommitted_id_test() ->
+    Opts = opts(),
+    Committed = ipfs_commit(#{ <<"body">> => ?HELLO_WORLD }, Opts),
+    {ok, _} = hb_cache:write(Committed, Opts),
+    {ok, Recovered} = hb_cache:read(?HELLO_WORLD_CID, Opts),
+    RecoveredBody =
+        hb_cache:ensure_loaded(maps:get(<<"body">>, Recovered), Opts),
+    ?assertEqual(?HELLO_WORLD, RecoveredBody),
+    %% The commitment itself must also come back, still keyed by the CID.
+    RecoveredComms = maps:get(<<"commitments">>, Recovered, #{}),
+    ?assert(maps:is_key(?HELLO_WORLD_CID, RecoveredComms)).
+
+%% @doc Multiple commitment devices on one message do not conflict: the
+%% CID still resolves through the cache.
+%% NOTE(review): only the `ipfs@1.0' commitment is attached below; no
+%% second commitment device is exercised — confirm whether that is intended.
+multiple_commitment_devices_coexist_test() ->
+    Opts = opts(),
+    Committed = ipfs_commit(#{ <<"body">> => ?HELLO_WORLD }, Opts),
+    {ok, _} = hb_cache:write(Committed, Opts),
+    {ok, ViaCID} = hb_cache:read(?HELLO_WORLD_CID, Opts),
+    LoadedBody = hb_cache:ensure_loaded(maps:get(<<"body">>, ViaCID), Opts),
+    ?assertEqual(?HELLO_WORLD, LoadedBody).
+
+%%% 3. to/3 and from/3 through hb_message:convert
+
+to_dag_cbor_simple_test() ->
+    Source = #{ <<"hello">> => <<"world">> },
+    Bytes = hb_message:convert(Source, <<"ipfs@1.0">>, opts()),
+    ?assertEqual(<<16#a1, 16#65, "hello", 16#65, "world">>, Bytes).
+
+%% @doc A message holding integers, floats, booleans, null, lists and a
+%% nested map must match itself (strictly) after a trip through dag-cbor
+%% and back to `structured@1.0'.
+roundtrip_typed_message_test() ->
+    Opts = opts(),
+    Msg = #{
+        <<"name">> => <<"alice">>,
+        <<"age">> => 30,
+        <<"score">> => 4.5,
+        <<"admin">> => true,
+        <<"parent">> => null,
+        <<"tags">> => [<<"a">>, <<"b">>, <<"c">>],
+        <<"nested">> => #{ <<"k">> => <<"v">>, <<"n">> => -42 }
+    },
+    Bytes = hb_message:convert(Msg, <<"ipfs@1.0">>, Opts),
+    Decoded =
+        hb_message:convert(Bytes, <<"structured@1.0">>, <<"ipfs@1.0">>, Opts),
+    ?assert(hb_message:match(Msg, Decoded, strict, Opts)).
+
+%% @doc Determinism of the codec: maps written with different key orders
+%% convert to the same bytes, and converting the same map twice gives the
+%% same bytes.
+encoding_is_deterministic_test() ->
+    Opts = opts(),
+    ToBytes = fun(Map) -> hb_message:convert(Map, <<"ipfs@1.0">>, Opts) end,
+    Forward = #{ <<"a">> => 1, <<"bb">> => 2, <<"ccc">> => 3 },
+    Shuffled = #{ <<"ccc">> => 3, <<"a">> => 1, <<"bb">> => 2 },
+    ?assertEqual(ToBytes(Forward), ToBytes(Shuffled)),
+    ?assertEqual(ToBytes(Forward), ToBytes(Forward)).
+
+%% @doc Committing the dag-cbor bytes of a message under
+%% `sha2-256-dag-cbor' yields a single commitment whose CID decodes to that
+%% hash-alg, carries the sha2-256 digest of the bytes, and has the
+%% `bafyrei' prefix.
+cid_matches_dag_cbor_of_message_test() ->
+    Opts = opts(),
+    Source = #{ <<"hello">> => <<"world">> },
+    Bytes = hb_message:convert(Source, <<"ipfs@1.0">>, Opts),
+    HashSpec = #{ <<"hash-alg">> => <<"sha2-256-dag-cbor">> },
+    Committed = ipfs_commit(#{ <<"body">> => Bytes }, Opts, HashSpec),
+    %% Exactly one commitment is expected; its key is the CID.
+    [CID] = maps:keys(maps:get(<<"commitments">>, Committed)),
+    {ok, Parts} = dev_codec_ipfs_cid:decode(CID),
+    ?assertEqual(<<"sha2-256-dag-cbor">>, maps:get(<<"hash-alg">>, Parts)),
+    ?assertEqual(crypto:hash(sha256, Bytes), maps:get(<<"digest">>, Parts)),
+    ?assertMatch(<<"bafyrei", _:52/binary>>, CID).
+
+%% @doc An atom other than `null', `true' or `false' cannot be encoded and
+%% is reported as an error by `dev_codec_ipfs:to/3'.
+unsupported_atom_rejected_test() ->
+    Result = dev_codec_ipfs:to(#{ <<"kind">> => something }, #{}, opts()),
+    ?assertMatch(
+        {error, {dag_cbor_encode, {unsupported_atom, something}}},
+        Result
+    ).
+
+%% @doc Local end-to-end (no network): encode a rich message, commit its
+%% CID, write, read back by CID, decode. Exercises the whole codec +
+%% commit + cache path with no mocks.
+local_end_to_end_encode_commit_cache_decode_test() ->
+    Opts = opts(),
+    Msg = #{
+        <<"kind">> => <<"greeting">>,
+        <<"from">> => <<"alice">>,
+        <<"to">> => <<"bob">>,
+        <<"count">> => 3,
+        <<"active">> => true
+    },
+    %% Encode, then commit the encoded bytes as a dag-cbor block.
+    Bytes = hb_message:convert(Msg, <<"ipfs@1.0">>, Opts),
+    HashSpec = #{ <<"hash-alg">> => <<"sha2-256-dag-cbor">> },
+    Committed = ipfs_commit(#{ <<"body">> => Bytes }, Opts, HashSpec),
+    [CID] = maps:keys(maps:get(<<"commitments">>, Committed)),
+    %% Store, then fetch by CID alone.
+    {ok, _} = hb_cache:write(Committed, Opts),
+    {ok, Fetched} = hb_cache:read(CID, Opts),
+    FetchedBytes =
+        hb_cache:ensure_loaded(maps:get(<<"body">>, Fetched), Opts),
+    ?assertEqual(Bytes, FetchedBytes),
+    %% The fetched bytes decode back to the original message.
+    Decoded =
+        hb_message:convert(
+            FetchedBytes,
+            <<"structured@1.0">>,
+            <<"ipfs@1.0">>,
+            Opts
+        ),
+    ?assert(hb_message:match(Msg, Decoded, strict, Opts)).
+
+%% @doc Converting a committed message to dag-cbor keeps its `commitments'
+%% key in the encoded IPLD, and converting back strictly matches the
+%% committed original.
+commit_then_encode_preserves_commitments_test() ->
+    Opts = opts(),
+    Source = #{ <<"body">> => ?HELLO_WORLD, <<"kind">> => <<"greeting">> },
+    Committed = ipfs_commit(Source, Opts),
+    Bytes = hb_message:convert(Committed, <<"ipfs@1.0">>, Opts),
+    {ok, Ipld} = dev_codec_ipfs_cbor:decode(Bytes),
+    ?assert(maps:is_key(<<"commitments">>, Ipld)),
+    Restored =
+        hb_message:convert(
+            Bytes,
+            <<"structured@1.0">>,
+            <<"ipfs@1.0">>,
+            Opts
+        ),
+    ?assert(hb_message:match(Committed, Restored, strict, Opts)).
+
+%% @doc Two different codecs of the same body give two distinct CIDs that
+%% both resolve to the same cached message.
+raw_and_dag_cbor_cids_coexist_test() ->
+    Opts = opts(),
+    Body = <<16#a0>>,
+    %% Commit the same body twice: first as `raw', then as `dag-cbor'.
+    RawCommitted =
+        ipfs_commit(
+            #{ <<"body">> => Body },
+            Opts,
+            #{ <<"hash-alg">> => <<"sha2-256-raw">> }
+        ),
+    BothCommitted =
+        ipfs_commit(
+            RawCommitted,
+            Opts,
+            #{ <<"hash-alg">> => <<"sha2-256-dag-cbor">> }
+        ),
+    ?assertEqual(2, maps:size(maps:get(<<"commitments">>, BothCommitted))),
+    {ok, _} = hb_cache:write(BothCommitted, Opts),
+    %% The dag-cbor CID of byte 0xa0 is the well-known empty-map CID.
+    {ok, ViaDagCbor} = hb_cache:read(?EMPTY_MAP_CID, Opts),
+    LoadedBody =
+        hb_cache:ensure_loaded(maps:get(<<"body">>, ViaDagCbor), Opts),
+    ?assertEqual(Body, LoadedBody).
+
+%%% 4. Live — real gateways, real HyperBEAM nodes
+
+%% @doc Live test: read the empty-map CID through a store chain ending in
+%% the gateway store, check the body is byte `0xa0', verify the commitment
+%% keyed by that CID, and decode the body to an empty map. Logs a skip note
+%% when the read does not succeed.
+live_end_to_end_fetch_and_decode_dag_cbor_test_() ->
+    {timeout, 60, fun() ->
+        application:ensure_all_started(inets),
+        application:ensure_all_started(ssl),
+        StoreChain = [hb_test_utils:test_store(), gateway_store()],
+        NodeOpts = opts(#{ store => StoreChain }),
+        case hb_cache:read(?EMPTY_MAP_CID, NodeOpts) of
+            {ok, Fetched} ->
+                Bytes =
+                    hb_cache:ensure_loaded(
+                        maps:get(<<"body">>, Fetched),
+                        NodeOpts
+                    ),
+                ?assertEqual(<<16#a0>>, Bytes),
+                VerifySpec = #{ <<"commitment-ids">> => [?EMPTY_MAP_CID] },
+                ?assert(hb_message:verify(Fetched, VerifySpec, NodeOpts)),
+                Decoded =
+                    hb_message:convert(
+                        Bytes,
+                        <<"structured@1.0">>,
+                        <<"ipfs@1.0">>,
+                        NodeOpts
+                    ),
+                ?assertEqual(#{}, Decoded);
+            _ ->
+                ?debugFmt(
+                    "Skipping: all gateways missed ~s",
+                    [?EMPTY_MAP_CID]
+                )
+        end
+    end}.
+
+%% @doc Start a node and fetch the `hello world' CID through its
+%% `~lookup@1.0' path; the response body must be the original bytes.
+live_http_get_cid_serves_body_test_() ->
+    {timeout, 90, fun() -> with_live_gateways(fun() ->
+        NodeURL = hb_http_server:start_node(node_opts()),
+        {ok, Response} = hb_http:get(NodeURL, ?LOOKUP_PATH, #{}),
+        ?assertEqual(?HELLO_WORLD, response_body(Response))
+    end) end}.
+
+%% @doc Recomputing the CID from the wire body must reproduce the
+%% requested CID — the only verification that matters in IPFS.
+live_http_body_round_trips_to_cid_test_() ->
+    {timeout, 90, fun() -> with_live_gateways(fun() ->
+        NodeURL = hb_http_server:start_node(node_opts()),
+        {ok, Response} = hb_http:get(NodeURL, ?LOOKUP_PATH, #{}),
+        %% Hash what came over the wire and compare with the requested CID.
+        Recomputed =
+            dev_codec_ipfs_cid:encode(
+                <<"raw">>,
+                sha2_256,
+                response_body(Response)
+            ),
+        ?assertEqual(?HELLO_WORLD_CID, Recomputed)
+    end) end}.
+
+%% @doc Look the CID up over HTTP on a node whose store chain is a fresh
+%% fs store followed by the gateway store, then read the same CID using
+%% only the fs store. Both reads must yield the `hello world' body.
+live_cache_preload_pattern_test_() ->
+    {timeout, 90, fun() -> with_live_gateways(fun() ->
+        %% A uniquely named fs store, reset so nothing is cached up front.
+        StoreName =
+            iolist_to_binary(
+                [
+                    "cache-TEST/ipfs-preload-",
+                    integer_to_list(erlang:system_time(microsecond))
+                ]
+            ),
+        LocalStore = #{
+            <<"store-module">> => hb_store_fs,
+            <<"name">> => StoreName
+        },
+        hb_store:reset(LocalStore),
+        Devices = [ipfs_device() | hb_opts:get(preloaded_devices, [], #{})],
+        NodeURL = hb_http_server:start_node(#{
+            cache_control => <<"cache">>,
+            priv_wallet => hb:wallet(),
+            preloaded_devices => Devices,
+            store => [LocalStore, gateway_store()]
+        }),
+        {ok, ViaHttp} = hb_http:get(NodeURL, ?LOOKUP_PATH, #{}),
+        ?assertEqual(?HELLO_WORLD, response_body(ViaHttp)),
+        %% Second read bypasses the gateway entirely.
+        LocalOpts = #{ store => [LocalStore] },
+        {ok, ViaLocal} = hb_cache:read(?HELLO_WORLD_CID, LocalOpts),
+        LocalBody =
+            hb_cache:ensure_loaded(
+                hb_ao:get(<<"body">>, ViaLocal, <<>>, LocalOpts),
+                LocalOpts
+            ),
+        ?assertEqual(?HELLO_WORLD, LocalBody)
+    end) end}.
+
+%% @doc Transport: an IPFS commitment must arrive on the client side
+%% under its CID map key, not under `h(Sig)'. This is what the `id='
+%% extension in `dev_codec_httpsig_siginfo' preserves.
+live_http_ipfs_commitment_survives_transport_test_() ->
+    {timeout, 90, fun() -> with_live_gateways(fun() ->
+        NodeURL = hb_http_server:start_node(node_opts()),
+        ClientOpts = #{ preloaded_devices =>
+            [ipfs_device() | hb_opts:get(preloaded_devices, [], #{})] },
+        {ok, R} = hb_http:get(NodeURL, ?LOOKUP_PATH, ClientOpts),
+        %% Normalize a bare-binary response into a message map.
+        Msg =
+            case R of
+                M when is_map(M) -> M;
+                B when is_binary(B) -> #{ <<"body">> => B }
+            end,
+        %% Keep only the commitments made by the `ipfs@1.0' device.
+        IpfsComms = maps:filter(
+            fun(_K, #{<<"commitment-device">> := <<"ipfs@1.0">>}) -> true;
+               (_K, _) -> false end,
+            maps:get(<<"commitments">>, Msg, #{})),
+        case maps:to_list(IpfsComms) of
+            [] -> ?debugFmt(
+                "Skipping: no IPFS commitment on response", []);
+            [{CID, _}] -> ?assertEqual(?HELLO_WORLD_CID, CID);
+            Many ->
+                %% With several IPFS commitments the requested CID must
+                %% still be one of the keys; log them for diagnosis.
+                ?debugFmt("multiple ipfs commitments: ~p", [Many]),
+                ?assert(lists:keymember(?HELLO_WORLD_CID, 1, Many))
+        end
+    end) end}.
+
+%% @doc Two in-process nodes, wired so a client request on Node B
+%% transparently pulls through Node A:
+%%
+%% Node A — upstream — has ONLY `hb_store_ipfs_gateway'. Every read
+%% passes through to the real IPFS network.
+%% Node B — downstream — has a primary fs store plus
+%% `hb_store_remote_node' pointed at Node A with `local-store' set to
+%% the primary. B's cache misses fall through to A; A's responses
+%% write through into B's primary on return.
+%%
+%% After the first query pins the body to B's primary, Node A is killed.
+%% The next query on B must still succeed — served entirely from B's cache.
+live_hb_to_hb_remote_store_relay_test_() ->
+    {timeout, 120, fun() -> with_live_gateways(fun() ->
+        %% Two distinct wallets — HB's server_id is derived from
+        %% `priv_wallet''s address, so shared wallets collapse two nodes
+        %% onto one listener.
+        Stock = hb_opts:get(preloaded_devices, [], #{}),
+        NodeAWallet = ar_wallet:new(),
+        NodeAServerID =
+            hb_util:human_id(ar_wallet:to_address(NodeAWallet)),
+        NodeAURL = hb_http_server:start_node(#{
+            port => 18770,
+            priv_wallet => NodeAWallet,
+            cache_control => <<"cache">>,
+            preloaded_devices => [ipfs_device() | Stock],
+            store => [gateway_store()]
+        }),
+        NodeBPrimary = hb_test_utils:test_store(),
+        NodeBURL = hb_http_server:start_node(#{
+            port => 18771,
+            priv_wallet => ar_wallet:new(),
+            cache_control => <<"cache">>,
+            preloaded_devices => [ipfs_device() | Stock],
+            store => [
+                NodeBPrimary,
+                #{
+                    <<"store-module">> => hb_store_remote_node,
+                    <<"node">> => NodeAURL,
+                    <<"local-store">> => [NodeBPrimary]
+                }
+            ]
+        }),
+        %% (1) First query: B -> A -> real IPFS; cached on B's primary on
+        %% the return path.
+        {ok, R1} = hb_http:get(NodeBURL, ?LOOKUP_PATH, #{}),
+        ?assertEqual(?HELLO_WORLD, response_body(R1)),
+        %% (2) B's primary now holds the message keyed by the CID.
+        LocalOnly = #{ store => [NodeBPrimary] },
+        {ok, MsgOnB} = hb_cache:read(?HELLO_WORLD_CID, LocalOnly),
+        ?assertEqual(
+            ?HELLO_WORLD,
+            hb_cache:ensure_loaded(
+                maps:get(<<"body">>, MsgOnB),
+                LocalOnly
+            )
+        ),
+        ?assert(
+            maps:is_key(
+                ?HELLO_WORLD_CID,
+                maps:get(<<"commitments">>, MsgOnB, #{})
+            )
+        ),
+        %% (3) Kill Node A; (4) B must still serve from primary.
+        ok = cowboy:stop_listener(NodeAServerID),
+        {ok, R2} = hb_http:get(NodeBURL, ?LOOKUP_PATH, #{}),
+        ?assertEqual(?HELLO_WORLD, response_body(R2))
+    end) end}.
+
+%% @doc Server-side half of the push-to-Arweave chain: read the CID and
+%% re-commit as ANS-104 signed. The final POST to `~arweave@2.9/tx'
+%% requires a funded wallet and a reachable bundler, out of scope for CI.
+live_lookup_then_ans104_commit_test_() ->
+    {timeout, 90, fun() -> with_live_gateways(fun() ->
+        NodeURL = hb_http_server:start_node(node_opts()),
+        %% NOTE(review): the original path expression was lost in
+        %% extraction. The suffix below is a reconstruction from the test's
+        %% stated intent (lookup, then commit as ANS-104) — confirm the
+        %% exact path segment before relying on it.
+        Path =
+            <<?LOOKUP_PATH/binary,
+              "/commit&commitment-device=ans104@1.0">>,
+        {ok, R} = hb_http:get(NodeURL, Path, #{}),
+        ?assertEqual(?HELLO_WORLD, response_body(R))
+    end) end}.
+
+%% @doc A Lua computation runs across IPFS-resolved data served by the
+%% local node — the same node handling HTTP traffic. The body is read
+%% through the cache, handed to a Lua `byte_length' function, and the
+%% result must equal the body's byte size.
+live_lua_computation_over_ipfs_body_test_() ->
+    {timeout, 90, fun() -> with_live_gateways(fun() ->
+        NodeOpts = node_opts(),
+        NodeURL = hb_http_server:start_node(NodeOpts),
+        {ok, IpfsMsg} = hb_cache:read(?HELLO_WORLD_CID, NodeOpts),
+        Body =
+            hb_cache:ensure_loaded(
+                hb_ao:get(<<"body">>, IpfsMsg, <<>>, NodeOpts),
+                NodeOpts
+            ),
+        ?assertEqual(?HELLO_WORLD, Body),
+        Base = #{
+            <<"device">> => <<"lua@5.3a">>,
+            <<"content-type">> => <<"application/lua">>,
+            <<"body">> =>
+                <<"function byte_length(base, req)\n"
+                  " return #base.body\n"
+                  "end\n">>,
+            <<"function">> => <<"byte_length">>,
+            <<"parameters">> => [ #{ <<"body">> => Body } ]
+        },
+        ?assertEqual(
+            byte_size(?HELLO_WORLD),
+            hb_ao:get(<<"byte_length">>, Base, undefined, NodeOpts)
+        ),
+        %% The node must still answer HTTP after the computation.
+        {ok, _} = hb_http:get(NodeURL, <<"/~meta@1.0/info">>, #{})
+    end) end}.
+
+%%% 5. Message test-vector battery
+
+%% @doc Run the full `hb_message_test_vectors' battery against
+%% `~ipfs@1.0', skipping vectors that do not apply to a content-addressed,
+%% unsigned-only codec. Keeping the skip list here — rather than inside
+%% the generic battery module — follows the `hb_ao_test_vectors' pattern
+%% of carrying device-specific quirks on the opts entry.
+suite_test_() ->
+    hb_test_utils:suite_with_opts(
+        hb_message_test_vectors:codec_test_suite([<<"ipfs@1.0">>]),
+        vector_opts()
+    ).
+
+%% The single opts entry handed to the vector battery: a fresh test store
+%% and wallet, plus the vectors this codec skips (each with its reason).
+vector_opts() ->
+    Skipped = [
+        %% Non-null/true/false atoms have no IPLD type and throw on
+        %% encode.
+        <<"Structured field atom parsing">>,
+        %% `~ipfs@1.0' is unsigned-only (content-addressed); the
+        %% node-message signing path needs a signed commitment.
+        <<"Sign node message">>,
+        %% `priv' is session-only state and is stripped by `to/3' —
+        %% it must never cross the content-addressed boundary.
+        <<"Priv survives conversion">>,
+        %% `{link, CID}' flattens to the CID string in phase 2. A
+        %% link-aware mapping through `hb_link' is the next phase.
+        <<"ID of linked message">>
+    ],
+    RunOpts = #{
+        store => hb_test_utils:test_store(),
+        priv_wallet => hb:wallet()
+    },
+    [#{
+        name => ipfs,
+        parallel => true,
+        desc => <<"ipfs@1.0">>,
+        opts => RunOpts,
+        skip => Skipped
+    }].
diff --git a/src/hb_message_test_vectors.erl b/src/hb_message_test_vectors.erl
index 54a311074..c22dd7492 100644
--- a/src/hb_message_test_vectors.erl
+++ b/src/hb_message_test_vectors.erl
@@ -2,6 +2,7 @@
 %%% `message@1.0' encoding and commitment APIs. Additionally, this module
 %%% houses tests that ensure the general functioning of the `hb_message' API.
 -module(hb_message_test_vectors).
+-export([codec_test_suite/1]).
 -include_lib("eunit/include/eunit.hrl").
 -include("include/hb.hrl").
 
diff --git a/src/hb_opts.erl b/src/hb_opts.erl
index baf333861..1bd496c51 100644
--- a/src/hb_opts.erl
+++ b/src/hb_opts.erl
@@ -192,6 +192,7 @@ default_message() ->
             #{<<"name">> => <<"greenzone@1.0">>, <<"module">> => dev_green_zone},
             #{<<"name">> => <<"httpsig@1.0">>, <<"module">> => dev_codec_httpsig},
             #{<<"name">> => <<"http-auth@1.0">>, <<"module">> => dev_codec_http_auth},
+            #{<<"name">> => <<"ipfs@1.0">>, <<"module">> => dev_codec_ipfs},
             #{<<"name">> => <<"hook@1.0">>, <<"module">> => dev_hook},
             #{<<"name">> => <<"hyperbuddy@1.0">>, <<"module">> => dev_hyperbuddy},
             #{<<"name">> => <<"copycat@1.0">>, <<"module">> => dev_copycat},
diff --git a/src/hb_store_ipfs_gateway.erl b/src/hb_store_ipfs_gateway.erl
new file mode 100644
index 000000000..ca1b701da
--- /dev/null
+++ b/src/hb_store_ipfs_gateway.erl
@@ -0,0 +1,288 @@
+%%% @doc Read-only store backend that fetches IPFS CIDs from a configured
+%%% set of HTTP gateways, verifies the body hashes to the requested CID,
+%%% and attaches an `~ipfs@1.0' unsigned commitment so the message remains
+%%% independently verifiable via `hb_message:verify/2,3'. The CID is the
+%%% authority, not the HTTPS certificate.
+%%%
+%%% Config entry:
+%%% ```
+%%% #{
+%%%     <<"store-module">> => hb_store_ipfs_gateway,
+%%%     <<"gateways">> => [<<"https://ipfs.io">>, ...],
+%%%     <<"timeout">> => 15000
+%%% }
+%%% '''
+%%% Place after local stores for read-through semantics. Non-CIDv1 keys are
+%%% ignored so the module is safe alongside Arweave-addressed stores.
+-module(hb_store_ipfs_gateway).
+-export([scope/1, type/2, read/2, resolve/2, list/2]).
+-include("include/hb.hrl").
+-include_lib("eunit/include/eunit.hrl").
+
+-define(DEFAULT_GATEWAYS, [
+    <<"https://ipfs.io">>,
+    <<"https://dweb.link">>,
+    <<"https://nftstorage.link">>
+]).
+-define(DEFAULT_TIMEOUT_MS, 15000).
+
+%% @doc The store's scope is always `remote', whatever its config — so
+%% local stores should come first in the chain.
+scope(_StoreOpts) -> remote.
+
+%% @doc Keys resolve to themselves: CIDs are never aliased.
+resolve(_StoreOpts, Key) -> Key.
+
+%% @doc Classify a key without contacting any gateway. A key that parses
+%% as a CID is `simple' (IPFS at this edge of the spec has no composite
+%% structure); every other key is `not_found'.
+type(_StoreOpts, Key) ->
+    case cid_of_key(Key) of
+        error -> not_found;
+        {ok, _CID, _Parts} -> simple
+    end.
+
+%% @doc List the keys of the message that `read/2' wraps around a fetched
+%% CID. Any result of `read/2' other than `{ok, Map}' is handed back as-is.
+list(StoreOpts, Key) ->
+    Result = read(StoreOpts, Key),
+    case Result of
+        {ok, Msg} when is_map(Msg) ->
+            {ok, hb_maps:keys(Msg, StoreOpts)};
+        _ ->
+            Result
+    end.
+
+%% @doc Read a CID through the configured gateways, trying them in order.
+%% Keys that are not CIDs are logged and reported as `not_found'. The
+%% `gateways' and `timeout' entries of the store options override the
+%% module defaults. A gateway whose body fails the digest check counts as
+%% a miss (the gateway lied), and `not_found' is returned once every
+%% gateway has missed.
+read(StoreOpts, Key) ->
+    case cid_of_key(Key) of
+        {ok, CID, Parts} ->
+            try_gateways(
+                hb_maps:get(
+                    <<"gateways">>, StoreOpts, ?DEFAULT_GATEWAYS, StoreOpts),
+                CID,
+                Parts,
+                hb_maps:get(
+                    <<"timeout">>, StoreOpts, ?DEFAULT_TIMEOUT_MS, StoreOpts)
+            );
+        error ->
+            ?event(ipfs_gateway, {ignoring_non_cid, Key}),
+            not_found
+    end.
+
+%% @doc Turn a store key into `{ok, CID, Parts}', where `Parts' is the
+%% decoded form of the CID, or `error'. A bare binary and a one-element
+%% path list are accepted; longer paths are rejected because there is no
+%% UnixFS/IPLD path resolver yet.
+cid_of_key(Bin) when is_binary(Bin) -> try_parse_cid(Bin);
+cid_of_key([Only]) -> try_parse_cid(Only);
+cid_of_key(_Unsupported) -> error.
+
+%% Decode a candidate CID. Non-binaries and undecodable binaries both
+%% collapse to `error'.
+try_parse_cid(Candidate) when not is_binary(Candidate) ->
+    error;
+try_parse_cid(Candidate) ->
+    case dev_codec_ipfs_cid:decode(Candidate) of
+        {error, _Reason} -> error;
+        {ok, Parts} -> {ok, Candidate, Parts}
+    end.
+
+%% Walk the gateway list in order. The first gateway whose body passes the
+%% digest check wins; a digest mismatch or any other failure is logged and
+%% the next gateway is tried. An exhausted list yields `not_found'.
+try_gateways([], CID, _Parts, _Timeout) ->
+    ?event(ipfs_gateway, {all_gateways_missed, {cid, CID}}),
+    not_found;
+try_gateways([Gateway | Remaining], CID, Parts, Timeout) ->
+    Outcome = fetch_and_verify(Gateway, CID, Parts, Timeout),
+    case Outcome of
+        {ok, Bytes} ->
+            ?event(ipfs_gateway,
+                {fetched,
+                    {cid, CID},
+                    {gateway, Gateway},
+                    {bytes, byte_size(Bytes)}}),
+            {ok, with_commitment(CID, Parts, Bytes)};
+        digest_mismatch ->
+            ?event(warning,
+                {ipfs_gateway_digest_mismatch,
+                    {cid, CID},
+                    {gateway, Gateway}}),
+            try_gateways(Remaining, CID, Parts, Timeout);
+        _ ->
+            ?event(ipfs_gateway,
+                {gateway_miss,
+                    {cid, CID},
+                    {gateway, Gateway},
+                    {reason, Outcome}}),
+            try_gateways(Remaining, CID, Parts, Timeout)
+    end.
+
+%% @doc Build the message returned for a verified body: the bytes under
+%% `body' plus a single `~ipfs@1.0' unsigned commitment keyed by the CID,
+%% so any downstream consumer can re-verify independently. Mirrors
+%% `dev_codec_ipfs:commit/3' — the signature is the encoded raw digest
+%% (keeps the commitment on the httpsig wire) and there is no keyid (no
+%% key material is needed for content-addressed commitments).
+with_commitment(CID,
+                #{ <<"hash-alg">> := HashAlg, <<"digest">> := Digest },
+                Body) ->
+    Commitment =
+        #{
+            <<"commitment-device">> => <<"ipfs@1.0">>,
+            <<"type">> => HashAlg,
+            <<"committed">> => [<<"body">>],
+            <<"signature">> => hb_util:encode(Digest)
+        },
+    #{
+        <<"body">> => Body,
+        <<"commitments">> => #{ CID => Commitment }
+    }.
+
+%% @doc Fetch a single gateway; verify the body against the CID digest
+%% before returning. Uses OTP `httpc' — no new dependency.
+fetch_and_verify(Gateway, CID, Parts, Timeout) ->
+    %% Returns `{ok, Body}' for a 200 response whose body matches the CID
+    %% digest, `digest_mismatch' for a 200 response whose body does not,
+    %% `not_found' for a 404, `{error, {http_status, Status}}' for any other
+    %% status and `{error, Reason}' when the request itself fails.
+    %%
+    %% NOTE(review): the binary on the next line had been reduced to `<>'
+    %% (not valid Erlang). It is reconstructed here as the path-gateway
+    %% form `<gateway>/ipfs/<cid>' — confirm against the upstream commit.
+    URL = binary_to_list(<<Gateway/binary, "/ipfs/", CID/binary>>),
+    Headers = [
+        {"accept", "application/vnd.ipld.raw, application/octet-stream"},
+        {"user-agent", "hyperbeam-ipfs/1.0"}
+    ],
+    %% The same budget is applied to connecting and to the request itself.
+    HTTPOpts = [{timeout, Timeout}, {connect_timeout, Timeout}],
+    Opts = [{body_format, binary}, {full_result, true}],
+    case httpc:request(get, {URL, Headers}, HTTPOpts, Opts) of
+        {ok, {{_, 200, _}, _, Body}} when is_binary(Body) ->
+            %% Never hand back bytes that do not hash to the requested CID.
+            case verify_digest(Parts, Body) of
+                true -> {ok, Body};
+                false -> digest_mismatch
+            end;
+        {ok, {{_, 404, _}, _, _}} -> not_found;
+        {ok, {{_, Status, _}, _, _}} -> {error, {http_status, Status}};
+        {error, Reason} -> {error, Reason}
+    end.
+
+%% @doc Compare a fetched body against the digest embedded in the CID.
+%% All `sha2-256-*' hash-algs share the same underlying digest function.
+%% Any other hash-alg, or a parts map without both keys, is rejected.
+verify_digest(#{ <<"hash-alg">> := <<"sha2-256-", _/binary>>,
+                 <<"digest">> := Expected }, Body) ->
+    Expected =:= crypto:hash(sha256, Body);
+verify_digest(_, _) ->
+    false.
+
+%%% Tests. See `dev_codec_ipfs_live_test' for broader end-to-end coverage.
+
+-define(HELLO_WORLD_CID,
+    <<"bafkreifzjut3te2nhyekklss27nh3k72ysco7y32koao5eei66wof36n5e">>).
+-define(HELLO_WORLD_BODY, <<"hello world">>).
+-define(LIVE_GATEWAYS, [
+    <<"https://ipfs.io">>,
+    <<"https://dweb.link">>,
+    <<"https://nftstorage.link">>,
+    <<"https://4everland.io">>
+]).
+
+%% Store options used by the live tests: every live gateway, 20 s timeout.
+live_store() ->
+    #{
+        <<"store-module">> => hb_store_ipfs_gateway,
+        <<"gateways">> => ?LIVE_GATEWAYS,
+        <<"timeout">> => 20000
+    }.
+
+%% Start the applications `httpc' needs. Results are deliberately ignored:
+%% a failure surfaces as a gateway miss, which the live tests treat as a
+%% skip.
+ensure_inets() ->
+    application:ensure_all_started(inets),
+    application:ensure_all_started(ssl).
+
+cid_of_key_test() ->
+    CID = ?HELLO_WORLD_CID,
+    ?assertMatch({ok, CID, #{}}, cid_of_key(CID)),
+    ?assertMatch({ok, CID, #{}}, cid_of_key([CID])),
+    ?assertEqual(error, cid_of_key(<<"not-a-cid">>)),
+    %% Arweave IDs (43-char base64url) are not claimed here.
+    ?assertEqual(error,
+        cid_of_key(<<"BOogk_XAI3bvNWnxNxwxmvOfglZt17o4MOVAdPNZ_ew">>)),
+    %% Multi-part paths are out of scope.
+    ?assertEqual(error, cid_of_key([CID, <<"sub">>])).
+
+verify_digest_accepts_correct_body_test() ->
+    Body = <<"hello world">>,
+    ?assert(verify_digest(#{
+        <<"hash-alg">> => <<"sha2-256-raw">>,
+        <<"digest">> => crypto:hash(sha256, Body) }, Body)).
+
+verify_digest_rejects_tampered_body_test() ->
+    ?assertNot(verify_digest(#{
+        <<"hash-alg">> => <<"sha2-256-raw">>,
+        <<"digest">> => crypto:hash(sha256, <<"hello world">>)
+    }, <<"hello earth">>)).
+
+verify_digest_accepts_dag_cbor_hash_alg_test() ->
+    Body = <<16#a0>>,
+    ?assert(verify_digest(#{
+        <<"hash-alg">> => <<"sha2-256-dag-cbor">>,
+        <<"digest">> => crypto:hash(sha256, Body) }, Body)).
+
+scope_is_remote_test() ->
+    ?assertEqual(remote, scope(#{})).
+
+read_ignores_non_cid_test() ->
+    ?assertEqual(not_found,
+        read(#{}, <<"BOogk_XAI3bvNWnxNxwxmvOfglZt17o4MOVAdPNZ_ew">>)).
+
+digest_gate_rejects_tampered_body_test() ->
+    {ok, Parts} = dev_codec_ipfs_cid:decode(?HELLO_WORLD_CID),
+    ?assert(verify_digest(Parts, ?HELLO_WORLD_BODY)),
+    ?assertNot(verify_digest(Parts, <<"hello earth">>)).
+
+%%% Live-service tests. The canonical `hello world' CID is pinned on every
+%%% public gateway; listing several avoids flaking on one being down.
+
+live_gateway_fetches_known_cid_test_() ->
+    {timeout, 60, fun() ->
+        ensure_inets(),
+        case read(live_store(), ?HELLO_WORLD_CID) of
+            {ok, Msg} ->
+                ?assertEqual(?HELLO_WORLD_BODY, maps:get(<<"body">>, Msg)),
+                Comms = maps:get(<<"commitments">>, Msg),
+                ?assert(maps:is_key(?HELLO_WORLD_CID, Comms)),
+                C = maps:get(?HELLO_WORLD_CID, Comms),
+                ?assertEqual(<<"ipfs@1.0">>,
+                    maps:get(<<"commitment-device">>, C)),
+                ?assertEqual(<<"sha2-256-raw">>, maps:get(<<"type">>, C));
+            not_found ->
+                %% No gateway reachable: report and pass rather than fail.
+                ?debugFmt("Skipping: all gateways missed ~s",
+                    [?HELLO_WORLD_CID])
+        end
+    end}.
+
+%% @doc The commitment attached by the gateway store must verify via the
+%% standard `hb_message:verify/2,3' machinery.
+live_gateway_attached_commitment_verifies_test_() ->
+    {timeout, 60, fun() ->
+        ensure_inets(),
+        case read(live_store(), ?HELLO_WORLD_CID) of
+            {ok, Msg} ->
+                ?assert(hb_message:verify(
+                    Msg,
+                    #{ <<"commitment-ids">> => [?HELLO_WORLD_CID] },
+                    #{}));
+            not_found ->
+                %% No gateway reachable: report and pass rather than fail.
+                ?debugFmt("Skipping: all gateways missed ~s",
+                    [?HELLO_WORLD_CID])
+        end
+    end}.
+
+%% @doc A CID missing from the local store must fall through to the
+%% gateway chain and return via the standard `hb_cache:read/2' path.
+live_hb_cache_reads_from_gateway_test_() ->
+    {timeout, 60, fun() ->
+        ensure_inets(),
+        %% Local test store first, gateway store second.
+        Opts = #{ store => [hb_test_utils:test_store(), live_store()] },
+        case hb_cache:read(?HELLO_WORLD_CID, Opts) of
+            {ok, Msg} ->
+                ?assertEqual(?HELLO_WORLD_BODY,
+                    hb_cache:ensure_loaded(
+                        maps:get(<<"body">>, Msg), Opts));
+            not_found ->
+                ?debugFmt("Skipping: all gateways missed CID", [])
+        end
+    end}.
+
+%% @doc A fake CIDv1 with random digest must not resolve anywhere — the
+%% digest gate refuses any body a gateway might return for this path.
+live_gateway_rejects_unpinned_cid_test_() ->
+    {timeout, 60, fun() ->
+        ensure_inets(),
+        UnpinnedCID =
+            dev_codec_ipfs_cid:encode(
+                <<"raw">>,
+                sha2_256,
+                %% A SHA-256 digest is 32 bytes. A longer value can never
+                %% equal `crypto:hash(sha256, _)' and may not even decode
+                %% as a CID, in which case `read/2' would return
+                %% `not_found' without contacting a gateway and this test
+                %% would pass without exercising the digest gate.
+                crypto:strong_rand_bytes(32)
+            ),
+        Store = (live_store())#{ <<"timeout">> => 10000 },
+        ?assertEqual(not_found, read(Store, UnpinnedCID))
+    end}.