Skip to content

Commit 635b66f

Browse files
committed
feat(ustring): ustring hash collision protection
The gist is that the ustring::strhash(str) function is modified to strip out the MSB from Strutil::strhash. The rep entry is filed in the ustring table based on this hash. So effectively, the computed hash is 63 bits, not 64. But the rep->hashed field holds the computed hash in its lower 63 bits, and its MSB indicates whether this is the 2nd (or later) entry in the table that had the same 63-bit hash. ustring::hash() then is modified as follows: If the MSB is 0, the computed hash is the hash. If the MSB is 1, though, we DON'T use that hash, and instead we use the pointer to the unique characters, but with the MSB set (that's an invalid address by itself). Note that the computed hashes never have the MSB set, and the char*+MSB values always have the MSB set, so ustring::hash() will never have the same value for two different ustrings. But -- please note! -- ustring::strhash(str) and ustring(str).hash() will only match (and also be the same value on every execution) if the ustring is the first to receive that hash, which should be approximately always. Probably always, in practice. But in the very improbable case of a hash collision, one of them (the second to be turned into a ustring) will be using the alternate hash based on the character address, which is neither the same as ustring::strhash(chars) nor expected to be the same constant on every program execution. Signed-off-by: Larry Gritz <lg@larrygritz.com>
1 parent 7841ab5 commit 635b66f

3 files changed

Lines changed: 176 additions & 169 deletions

File tree

src/include/OpenImageIO/ustring.h

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#define OIIO_USTRING_HAS_CTR_FROM_USTRINGHASH 1
3131
#define OIIO_USTRING_HAS_STDHASH 1
3232
#define OIIO_HAS_USTRINGHASH_FORMATTER 1
33+
#define OIIO_USTRING_SAFE_HASH 1
3334

3435

3536
OIIO_NAMESPACE_3_1_BEGIN
@@ -120,6 +121,16 @@ OIIO_NAMESPACE_3_1_BEGIN
120121
/// - if you don't need to do a lot of string assignment or equality
121122
/// testing, but lots of more complex string manipulation.
122123
///
124+
/// The ustring implementation guarantees that no two ustrings return the same
125+
/// value for hash() (despite the slim probability that two strings could
126+
/// numerically hash to the same value). For the first ustring added with a
127+
/// given hash, u.hash() will be the same value as ustring::strhash(chars),
128+
/// and will deterministically be the same on every execution. In the very
129+
/// improbable case of a hash collision, subsequent ustrings with the same
130+
/// numeric hash will use an alternate hash based on the character address,
131+
/// which is neither the same as ustring::strhash(chars) nor expected
132+
/// to be the same constant on every program execution.
133+
123134
class OIIO_UTIL_API ustring {
124135
public:
125136
using rep_t = const char*; ///< The underlying representation type
@@ -285,11 +296,7 @@ class OIIO_UTIL_API ustring {
285296
/// Return a C++ std::string representation of a ustring.
286297
const std::string& string() const noexcept
287298
{
288-
if (m_chars) {
289-
const TableRep* rep = (const TableRep*)m_chars - 1;
290-
return rep->str;
291-
} else
292-
return empty_std_string;
299+
return m_chars ? rep()->str : empty_std_string;
293300
}
294301

295302
/// Reset to an empty string.
@@ -300,17 +307,27 @@ class OIIO_UTIL_API ustring {
300307
{
301308
if (!m_chars)
302309
return 0;
303-
const TableRep* rep = ((const TableRep*)m_chars) - 1;
304-
return rep->length;
310+
return rep()->length;
305311
}
306312

307-
/// Return a hashed version of the string
313+
/// ustring::strhash() uses Strutil::strhash but clears the MSB.
314+
static OIIO_HOSTDEVICE constexpr hash_t strhash(string_view str)
315+
{
316+
return Strutil::strhash(str) & hash_mask;
317+
}
318+
319+
/// Return a hashed version of the string. To guarantee unique hashes,
320+
/// we check if the "duplicate bit" of the hash is set. If not, then
321+
/// we just return the hash which we know is unique. But if that bit
322+
/// is set, we utilize the unique character address.
308323
hash_t hash() const noexcept
309324
{
310325
if (!m_chars)
311326
return 0;
312-
const TableRep* rep = ((const TableRep*)m_chars) - 1;
313-
return rep->hashed;
327+
hash_t h = rep()->hashed;
328+
return OIIO_LIKELY((h & duplicate_bit) == 0)
329+
? h
330+
: hash_t(m_chars) | duplicate_bit;
314331
}
315332

316333
/// Return a hashed version of the string
@@ -736,6 +753,8 @@ class OIIO_UTIL_API ustring {
736753
// if you know the rep, the chars are at (char *)(rep+1), and if you
737754
// know the chars, the rep is at ((TableRep *)chars - 1).
738755
struct TableRep {
756+
// hashed has the MSB set if and only if this is the second or
757+
// greater ustring to have the same hash.
739758
hash_t hashed; // precomputed Hash value
740759
std::string str; // String representation
741760
size_t length; // Length of the string; must be right before cap
@@ -744,10 +763,29 @@ class OIIO_UTIL_API ustring {
744763
TableRep(string_view strref, hash_t hash);
745764
~TableRep();
746765
// The character data is stored immediately after this TableRep, so the
// C string begins at the first byte past the end of `*this`.
const char* c_str() const noexcept
{
    return reinterpret_cast<const char*>(this + 1);
}
766+
constexpr bool unique_hash() const
767+
{
768+
return (hashed & duplicate_bit) == 0;
769+
}
747770
};
748771

772+
// duplicate_bit is a 1 in the MSB, which if set indicates a hash that
773+
// is a duplicate.
774+
static constexpr hash_t duplicate_bit = hash_t(1) << 63;
775+
// hash_mask is what you `&` with hashed to get the real hash (clearing
776+
// the duplicate bit).
777+
#if 1
778+
static constexpr hash_t hash_mask = ~duplicate_bit;
779+
#else
780+
// Alternate to force lots of hash collisions for testing purposes:
781+
static constexpr hash_t hash_mask = ~duplicate_bit & 0xffff;
782+
#endif
783+
bool has_unique_hash() const { return rep()->unique_hash(); }
784+
749785
private:
750786
static std::string empty_std_string;
787+
788+
// Recover the TableRep that sits immediately before the character data.
// Callers must check that m_chars is non-null before using the result.
const TableRep* rep() const
{
    const TableRep* past_rep = reinterpret_cast<const TableRep*>(m_chars);
    return past_rep - 1;
}
751789
};
752790

753791

@@ -798,7 +836,7 @@ class OIIO_UTIL_API ustringhash {
798836
OIIO_DEVICE_CONSTEXPR explicit ustringhash(const char* str)
799837
#ifdef __CUDA_ARCH__
800838
// GPU: just compute the hash. This can be constexpr!
801-
: m_hash(Strutil::strhash(str))
839+
: m_hash(ustring::strhash(str))
802840
#else
803841
// CPU: make ustring, get its hash. Note that ustring ctr can't be
804842
// constexpr because it has to modify the internal ustring table.
@@ -810,7 +848,7 @@ class OIIO_UTIL_API ustringhash {
810848
OIIO_DEVICE_CONSTEXPR explicit ustringhash(const char* str, size_t len)
811849
#ifdef __CUDA_ARCH__
812850
// GPU: just compute the hash. This can be constexpr!
813-
: m_hash(Strutil::strhash(len, str))
851+
: m_hash(ustring::strhash(len, str))
814852
#else
815853
// CPU: make ustring, get its hash. Note that ustring ctr can't be
816854
// constexpr because it has to modify the internal ustring table.
@@ -824,7 +862,7 @@ class OIIO_UTIL_API ustringhash {
824862
OIIO_DEVICE_CONSTEXPR explicit ustringhash(string_view str)
825863
#ifdef __CUDA_ARCH__
826864
// GPU: just compute the hash. This can be constexpr!
827-
: m_hash(Strutil::strhash(str))
865+
: m_hash(ustring::strhash(str))
828866
#else
829867
// CPU: make ustring, get its hash. Note that ustring ctr can't be
830868
// constexpr because it has to modify the internal ustring table.
@@ -918,13 +956,13 @@ class OIIO_UTIL_API ustringhash {
918956
/// Test for equality with a char*.
919957
constexpr bool operator==(const char* str) const noexcept
920958
{
921-
return m_hash == Strutil::strhash(str);
959+
return m_hash == ustring::strhash(str);
922960
}
923961

924962
/// Test for inequality with a char*.
925963
constexpr bool operator!=(const char* str) const noexcept
926964
{
927-
return m_hash != Strutil::strhash(str);
965+
return m_hash != ustring::strhash(str);
928966
}
929967

930968
#ifndef __CUDA_ARCH__

0 commit comments

Comments
 (0)