Skip to content
This repository was archived by the owner on Nov 9, 2017. It is now read-only.

Commit 7b64d42

Browse files
kblees authored and gitster committed
hashmap: add string interning API
Interning short strings with high probability of duplicates can reduce the memory footprint and speed up comparisons. Add strintern() and memintern() APIs that use a hashmap to manage the pool of unique, interned strings. Note: strintern(getenv()) could be used to sanitize git's use of getenv(), in case we ever encounter a platform where a call to getenv() invalidates previous getenv() results (which is allowed by POSIX). Signed-off-by: Karsten Blees <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent ab73a9d commit 7b64d42

File tree

5 files changed

+88
-0
lines changed

5 files changed

+88
-0
lines changed

Diff for: Documentation/technical/api-hashmap.txt

+15
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,21 @@ more entries.
193193
`hashmap_iter_first` is a combination of both (i.e. initializes the iterator
194194
and returns the first entry, if any).
195195

196+
`const char *strintern(const char *string)`::
197+
`const void *memintern(const void *data, size_t len)`::
198+
199+
Returns the unique, interned version of the specified string or data,
200+
similar to the `String.intern` API in Java and .NET.
201+
Interned strings remain valid for the entire lifetime of the process.
202+
+
203+
Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned
204+
strings / data must not be modified or freed.
205+
+
206+
Interned strings are best used for short strings with high probability of
207+
duplicates.
208+
+
209+
Uses a hashmap to store the pool of interned strings.
210+
196211
Usage example
197212
-------------
198213

Diff for: hashmap.c

+38
Original file line numberDiff line numberDiff line change
@@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter)
226226
current = iter->map->table[iter->tablepos++];
227227
}
228228
}
229+
230+
/*
 * Entry in the intern pool used by memintern(): hashmap bookkeeping plus an
 * inline copy of the interned bytes.
 */
struct pool_entry {
231+
/*
 * Hashmap linkage. memintern() hands a pool_entry pointer straight to
 * hashmap_entry_init(), so this presumably has to stay the first member
 * (confirm against hashmap.h).
 */
struct hashmap_entry ent;
232+
/* number of payload bytes stored in data[] */
size_t len;
233+
/* the interned bytes, allocated inline right after the header */
unsigned char data[FLEX_ARRAY];
234+
};
235+
236+
/*
 * Comparison callback for the intern pool (installed by memintern() through
 * hashmap_init()). Evaluates to 0 when e1 holds the bytes at keydata and to
 * non-zero otherwise.
 *
 * e1:      pool entry being examined
 * e2:      lookup key; only its len field is read here
 * keydata: the bytes being looked up (memintern() passes its data argument)
 */
static int pool_entry_cmp(const struct pool_entry *e1,
237+
const struct pool_entry *e2,
238+
const unsigned char *keydata)
239+
{
240+
/*
 * Identical pointers match without inspecting the contents; otherwise
 * the lengths must agree and the bytes must compare equal.
 */
return e1->data != keydata &&
241+
(e1->len != e2->len || memcmp(e1->data, keydata, e1->len));
242+
}
243+
244+
/*
 * Return a pointer to the pooled copy of the len bytes at data, adding a new
 * copy to the pool when no equal entry exists yet.
 *
 * The pool is a function-local static hashmap that is set up on the first
 * call (detected by map.tablesize being zero). Nothing in this function
 * removes or frees pool entries. The returned pointer refers to the entry's
 * own storage, never to the caller's buffer.
 */
const void *memintern(const void *data, size_t len)
245+
{
246+
static struct hashmap map;
247+
struct pool_entry key, *e;
248+
249+
/* initialize string pool hashmap */
250+
if (!map.tablesize)
251+
hashmap_init(&map, (hashmap_cmp_fn) pool_entry_cmp, 0);
252+
253+
/* lookup interned string in pool */
/*
 * key is only a lookup probe: its hash and len are filled in, while the
 * bytes themselves are passed separately as the keydata argument.
 */
254+
hashmap_entry_init(&key, memhash(data, len));
255+
key.len = len;
256+
e = hashmap_get(&map, &key, data);
257+
if (!e) {
258+
/* not found: create it */
/*
 * NOTE(review): xmallocz() presumably allocates one extra byte and
 * NUL-terminates the buffer, which strintern() callers would rely
 * on -- confirm against its definition.
 */
259+
e = xmallocz(sizeof(struct pool_entry) + len);
/* reuse the hash already computed for the lookup key */
260+
hashmap_entry_init(e, key.ent.hash);
261+
e->len = len;
262+
memcpy(e->data, data, len);
263+
hashmap_add(&map, e);
264+
}
265+
return e->data;
266+
}

Diff for: hashmap.h

+8
Original file line numberDiff line numberDiff line change
@@ -87,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map,
8787
return hashmap_iter_next(iter);
8888
}
8989

90+
/* string interning */
91+
92+
/*
 * Returns the pooled (interned) copy of the len bytes at data. The result is
 * const: callers must not modify or free it.
 */
extern const void *memintern(const void *data, size_t len);
93+
/*
 * Interns a NUL-terminated string: measures it with strlen() and hands the
 * bytes (terminator not counted in the length) to memintern(). string must
 * not be NULL, since it is passed to strlen().
 */
static inline const char *strintern(const char *string)
94+
{
95+
return memintern(string, strlen(string));
96+
}
97+
9098
#endif

Diff for: t/t0011-hashmap.sh

+13
Original file line numberDiff line numberDiff line change
@@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' '
237237
238238
'
239239

240+
# Issue four "intern" commands and expect each value echoed back unchanged.
# "value1" and "Value1" differ only in case, and "value2" is submitted twice.
# NOTE(review): test_hashmap is defined elsewhere in this script; presumably
# its first argument is the command input and its second the expected output.
test_expect_success 'string interning' '
241+
242+
test_hashmap "intern value1
243+
intern Value1
244+
intern value2
245+
intern value2
246+
" "value1
247+
Value1
248+
value2
249+
value2"
250+
251+
'
252+
240253
test_done

Diff for: test-hashmap.c

+14
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,20 @@ int main(int argc, char *argv[])
234234
/* print table sizes */
235235
printf("%u %u\n", map.tablesize, map.size);
236236

237+
} else if (!strcmp("intern", cmd) && l1) {
238+
239+
/* test that strintern works */
240+
const char *i1 = strintern(p1);
241+
const char *i2 = strintern(p1);
242+
if (strcmp(i1, p1))
243+
printf("strintern(%s) returns %s\n", p1, i1);
244+
else if (i1 == p1)
245+
printf("strintern(%s) returns input pointer\n", p1);
246+
else if (i1 != i2)
247+
printf("strintern(%s) != strintern(%s)", i1, i2);
248+
else
249+
printf("%s\n", i1);
250+
237251
} else if (!strcmp("perfhashmap", cmd) && l1 && l2) {
238252

239253
perf_hashmap(atoi(p1), atoi(p2));

0 commit comments

Comments
 (0)