Skip to content

Commit 65a9c4b

Browse files
Hocuri and link2xt
authored
File deduplication (#6332)
When receiving messages, blobs will be deduplicated with the new function `create_and_deduplicate_from_bytes()`. For sending files, this adds a new function `set_file_and_deduplicate()` instead of deduplicating by default. This is for #6265; read the issue description there for more details. TODO: - [x] Set files as read-only - [x] Don't do a write when the file is already identical - [x] The first 32 chars or so of the 64-character hash are enough. I calculated that if 10 billion people (i.e. all of humanity) use DC, and each of them has 200k distinct blob files (I have 4k in my day-to-day account), and we used 20 chars, then the expected value for the number of name collisions would be ~0.0002 (and the probability that there is at least one name collision is lower than that) [^1]. I added 12 more characters to be on the super safe side, but this wouldn't be necessary and I could also make it 20 instead of 32. - Not 100% sure whether that's necessary at all - it would mainly be necessary if we might hit a length limit on some file systems (the blobdir is usually something like `accounts/2ff9fc096d2f46b6832b24a1ed99c0d6/dc.db-blobs` (53 chars), plus 64 chars for the filename would be 117). - [x] "touch" the files to prevent them from being deleted - [x] TODOs in the code For later PRs: - Replace `BlobObject::create(…)` with `BlobObject::create_and_deduplicate(…)` in order to deduplicate every time core creates a file - Modify JsonRPC to deduplicate blob files - Possibly rename BlobObject.name to BlobObject.file in order to prevent confusion (because `name` usually means "user-visible-name", not "name of the file on disk"). [^1]: Calculated with both https://printfn.github.io/fend/ and https://www.geogebra.org/calculator, both of which came to the same result ([1](https://github.com/user-attachments/assets/bbb62550-3781-48b5-88b1-ba0e29c28c0d), [2](https://github.com/user-attachments/assets/82171212-b797-4117-a39f-0e132eac7252)) --------- Co-authored-by: l <[email protected]>
1 parent 22a7cfe commit 65a9c4b

File tree

23 files changed

+582
-239
lines changed

23 files changed

+582
-239
lines changed

Cargo.lock

Lines changed: 5 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ toml = "0.8"
110110
url = "2"
111111
uuid = { version = "1", features = ["serde", "v4"] }
112112
webpki-roots = "0.26.7"
113+
blake3 = "1.5.5"
113114

114115
[dev-dependencies]
115116
anyhow = { workspace = true, features = ["backtrace"] } # Enable `backtrace` feature in tests.

deltachat-ffi/deltachat.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4756,6 +4756,31 @@ void dc_msg_set_override_sender_name(dc_msg_t* msg, const char* name)
47564756
void dc_msg_set_file (dc_msg_t* msg, const char* file, const char* filemime);
47574757

47584758

4759+
/**
4760+
* Sets the file associated with a message.
4761+
*
4762+
* If `name` is non-null, it is used as the file name
4763+
* and the actual current name of the file is ignored.
4764+
*
4765+
* If the source file is already in the blobdir, it will be renamed,
4766+
* otherwise it will be copied to the blobdir first.
4767+
*
4768+
* In order to deduplicate files that contain the same data,
4769+
* the file will be named as a hash of the file data.
4770+
*
4771+
* NOTE:
4772+
* - This function will rename the file. To get the new file path, call `get_file()`.
4773+
* - The file must not be modified after this function was called.
4774+
*
4775+
* @memberof dc_msg_t
4776+
* @param msg The message object. Must not be NULL.
4777+
* @param file The path of the file to attach. Must not be NULL.
4778+
* @param name The original filename of the attachment. If NULL, the current name of `file` will be used instead.
4779+
* @param filemime The MIME type of the file. NULL if you don't know or don't care.
4780+
*/
4781+
void dc_msg_set_file_and_deduplicate(dc_msg_t* msg, const char* file, const char* name, const char* filemime);
4782+
4783+
47594784
/**
47604785
* Set the dimensions associated with message object.
47614786
* Typically this is the width and the height of an image or video associated using dc_msg_set_file().

deltachat-ffi/src/lib.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3835,6 +3835,33 @@ pub unsafe extern "C" fn dc_msg_set_file(
38353835
)
38363836
}
38373837

3838+
#[no_mangle]
3839+
pub unsafe extern "C" fn dc_msg_set_file_and_deduplicate(
3840+
msg: *mut dc_msg_t,
3841+
file: *const libc::c_char,
3842+
name: *const libc::c_char,
3843+
filemime: *const libc::c_char,
3844+
) {
3845+
if msg.is_null() || file.is_null() {
3846+
eprintln!("ignoring careless call to dc_msg_set_file_and_deduplicate()");
3847+
return;
3848+
}
3849+
let ffi_msg = &mut *msg;
3850+
let ctx = &*ffi_msg.context;
3851+
3852+
ffi_msg
3853+
.message
3854+
.set_file_and_deduplicate(
3855+
ctx,
3856+
as_path(file),
3857+
to_opt_string_lossy(name).as_deref(),
3858+
to_opt_string_lossy(filemime).as_deref(),
3859+
)
3860+
.context("Failed to set file")
3861+
.log_err(&*ffi_msg.context)
3862+
.ok();
3863+
}
3864+
38383865
#[no_mangle]
38393866
pub unsafe extern "C" fn dc_msg_set_dimension(
38403867
msg: *mut dc_msg_t,

python/src/deltachat/message.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,9 @@ def set_html(self, html_text):
108108

109109
@props.with_doc
110110
def filename(self):
111-
"""filename if there was an attachment, otherwise empty string."""
111+
"""file path if there was an attachment, otherwise empty string.
112+
If you want to get the file extension or a user-visible string,
113+
use `basename` instead."""
112114
return from_dc_charpointer(lib.dc_msg_get_file(self._dc_msg))
113115

114116
def set_file(self, path, mime_type=None):
@@ -120,7 +122,8 @@ def set_file(self, path, mime_type=None):
120122

121123
@props.with_doc
122124
def basename(self) -> str:
123-
"""basename of the attachment if it exists, otherwise empty string."""
125+
"""The user-visible name of the attachment (incl. extension)
126+
if it exists, otherwise empty string."""
124127
# FIXME, it does not return basename
125128
return from_dc_charpointer(lib.dc_msg_get_filename(self._dc_msg))
126129

python/tests/test_1_online.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -181,15 +181,16 @@ def send_and_receive_message():
181181
msg = send_and_receive_message()
182182
assert msg.text == "withfile"
183183
assert open(msg.filename).read() == "some data"
184-
msg.filename.index(basename)
185-
assert msg.filename.endswith(ext)
184+
msg.basename.index(basename)
185+
assert msg.basename.endswith(ext)
186186

187187
msg2 = send_and_receive_message()
188188
assert msg2.text == "withfile"
189189
assert open(msg2.filename).read() == "some data"
190-
msg2.filename.index(basename)
191-
assert msg2.filename.endswith(ext)
192-
assert msg.filename != msg2.filename
190+
msg2.basename.index(basename)
191+
assert msg2.basename.endswith(ext)
192+
assert msg.filename == msg2.filename # The file is deduplicated
193+
assert msg.basename == msg2.basename
193194

194195

195196
def test_send_file_html_attachment(tmp_path, acfactory, lp):
@@ -214,8 +215,8 @@ def test_send_file_html_attachment(tmp_path, acfactory, lp):
214215
msg = ac2.get_message_by_id(ev.data2)
215216

216217
assert open(msg.filename).read() == content
217-
msg.filename.index(basename)
218-
assert msg.filename.endswith(ext)
218+
msg.basename.index(basename)
219+
assert msg.basename.endswith(ext)
219220

220221

221222
def test_html_message(acfactory, lp):

0 commit comments

Comments
 (0)