Skip to content

Commit 65194d6

Browse files
committed
chore(git): merge from copilot/check-nlink-hardlink-deduplication
2 parents 3d4c387 + eb92bdc commit 65194d6

5 files changed

Lines changed: 104 additions & 30 deletions

File tree

src/hardlink/aware.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,9 @@ where
8282
}));
8383

8484
let ino = InodeNumber::get(stats);
85+
let dev = stats.dev();
8586
self.record
86-
.add(ino, size, links, path)
87+
.add(ino, dev, size, links, path)
8788
.map_err(ReportHardlinksError::AddToRecord)
8889
}
8990
}

src/hardlink/hardlink_list.rs

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,20 @@ use pipe_trait::Pipe;
2020
#[cfg(any(unix, test))]
2121
use std::path::Path;
2222

23+
/// Internal key used to uniquely identify an inode across all filesystems.
24+
///
25+
/// Hardlinks cannot span filesystems, so including the device number prevents
26+
/// false deduplication of files from different filesystems that happen to share
27+
/// the same inode number. Both du-dust and dua-cli track `(device, inode)` pairs
28+
/// for the same reason.
29+
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
30+
struct InodeKey {
31+
/// Device number of the filesystem the inode belongs to.
32+
dev: u64,
33+
/// Inode number within the device.
34+
ino: InodeNumber,
35+
}
36+
2337
/// Map value in [`HardlinkList`].
2438
#[derive(Debug, Clone)]
2539
struct Value<Size> {
@@ -38,8 +52,8 @@ struct Value<Size> {
3852
/// [`Reflection`] which implement these traits.
3953
#[derive(Debug, SmartDefault, Clone)]
4054
pub struct HardlinkList<Size>(
41-
/// Map an inode number to its size, number of links, and detected paths.
42-
DashMap<InodeNumber, Value<Size>>,
55+
/// Map an inode key (device + inode number) to its size, number of links, and detected paths.
56+
DashMap<InodeKey, Value<Size>>,
4357
);
4458

4559
impl<Size> HardlinkList<Size> {
@@ -112,13 +126,15 @@ where
112126
pub(crate) fn add(
113127
&self,
114128
ino: InodeNumber,
129+
dev: u64,
115130
size: Size,
116131
links: u64,
117132
path: &Path,
118133
) -> Result<(), AddError<Size>> {
134+
let key = InodeKey { dev, ino };
119135
let mut assertions = Ok(());
120136
self.0
121-
.entry(ino)
137+
.entry(key)
122138
.and_modify(|recorded| {
123139
if size != recorded.size {
124140
assertions = Err(AddError::SizeConflict(SizeConflictError {

src/hardlink/hardlink_list/iter.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use super::{HardlinkList, Value};
1+
use super::{HardlinkList, InodeKey, Value};
22
use crate::{hardlink::LinkPathList, inode::InodeNumber};
33
use dashmap::{iter::Iter as DashIter, mapref::multiple::RefMulti};
44
use pipe_trait::Pipe;
@@ -7,7 +7,7 @@ use pipe_trait::Pipe;
77
#[derive(derive_more::Debug)]
88
#[debug(bound())]
99
#[debug("Iter(..)")]
10-
pub struct Iter<'a, Size>(DashIter<'a, InodeNumber, Value<Size>>);
10+
pub struct Iter<'a, Size>(DashIter<'a, InodeKey, Value<Size>>);
1111

1212
impl<Size> HardlinkList<Size> {
1313
/// Iterate over the recorded entries.
@@ -20,7 +20,7 @@ impl<Size> HardlinkList<Size> {
2020
#[derive(derive_more::Debug)]
2121
#[debug(bound())]
2222
#[debug("Item(..)")]
23-
pub struct Item<'a, Size>(RefMulti<'a, InodeNumber, Value<Size>>);
23+
pub struct Item<'a, Size>(RefMulti<'a, InodeKey, Value<Size>>);
2424

2525
impl<'a, Size> Iterator for Iter<'a, Size> {
2626
type Item = Item<'a, Size>;
@@ -33,7 +33,7 @@ impl<'a, Size> Item<'a, Size> {
3333
/// The inode number of the file.
3434
#[inline]
3535
pub fn ino(&self) -> InodeNumber {
36-
*self.0.key()
36+
self.0.key().ino
3737
}
3838

3939
/// Size of the file.

src/hardlink/hardlink_list/reflection.rs

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use super::{HardlinkList, Value};
1+
use super::{HardlinkList, InodeKey, Value};
22
use crate::{hardlink::LinkPathListReflection, inode::InodeNumber};
33
use dashmap::DashMap;
44
use derive_more::{Display, Error, Into, IntoIterator};
@@ -12,8 +12,14 @@ use serde::{Deserialize, Serialize};
1212
/// internal content.
1313
///
1414
/// **Guarantees:**
15-
/// * Every inode number is unique.
16-
/// * The internal list is always sorted by inode numbers.
15+
/// * Every `(device, inode)` pair is unique within the scope of a single scan, but inode
16+
/// numbers alone are **not** guaranteed to be unique: when scanning multiple filesystems,
17+
/// two unrelated files on different devices can share the same inode number and will each
18+
/// produce a separate entry. The reflection stores only the inode number (the JSON format
19+
/// does not carry device information), so round-tripping a multi-filesystem scan through
20+
/// JSON is an unsupported edge case.
21+
/// * The internal list is always sorted by inode numbers (and by device number as a
22+
/// tie-breaker when two entries share the same inode number).
1723
///
1824
/// **Equality:** `Reflection` implements `PartialEq` and `Eq` traits.
1925
///
@@ -95,10 +101,17 @@ impl<Size> From<Vec<ReflectionEntry<Size>>> for Reflection<Size> {
95101

96102
impl<Size> From<HardlinkList<Size>> for Reflection<Size> {
97103
fn from(HardlinkList(list): HardlinkList<Size>) -> Self {
98-
list.into_iter()
99-
.map(|(ino, value)| ReflectionEntry::new(ino, value))
104+
// Collect to a vec, sort by (ino, dev) for a stable, deterministic order, then
105+
// strip dev before wrapping. Sorting here (with dev still available) avoids the
106+
// nondeterminism that would arise from an unstable sort on ino alone when two
107+
// entries from different filesystems share the same inode number.
108+
let mut pairs: Vec<(InodeKey, Value<Size>)> = list.into_iter().collect();
109+
pairs.sort_unstable_by_key(|(key, _)| (u64::from(key.ino), key.dev));
110+
pairs
111+
.into_iter()
112+
.map(|(key, value)| ReflectionEntry::new(key.ino, value))
100113
.collect::<Vec<_>>()
101-
.pipe(Reflection::from)
114+
.pipe(Reflection)
102115
}
103116
}
104117

@@ -119,7 +132,15 @@ impl<Size> TryFrom<Reflection<Size>> for HardlinkList<Size> {
119132

120133
for entry in entries {
121134
let (ino, value) = entry.dissolve();
122-
if map.insert(ino, value).is_some() {
135+
// Device number is unknown when loading from a reflection (e.g. JSON input);
136+
// use dev=0 as a placeholder. This means that when reloading JSON output that
137+
// was produced by scanning multiple filesystems, files from different devices
138+
// sharing the same inode number cannot be distinguished and therefore cannot
139+
// all be represented. Such duplicates cause a ConversionError::DuplicatedInode
140+
// and are treated as an unsupported edge case, since the JSON format does not
141+
// carry device information.
142+
let key = InodeKey { dev: 0, ino };
143+
if map.insert(key, value).is_some() {
123144
return ino.pipe(ConversionError::DuplicatedInode).pipe(Err);
124145
}
125146
}

src/hardlink/hardlink_list/test.rs

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,22 @@ use crate::size::Bytes;
33
use pipe_trait::Pipe;
44
use pretty_assertions::{assert_eq, assert_ne};
55

6-
const TABLE: &[(u64, u64, u64, &str)] = &[
7-
(241, 3652, 1, "a"),
8-
(569, 2210, 1, "b"),
9-
(110, 2350, 3, "c"),
10-
(110, 2350, 3, "c1"),
11-
(778, 1110, 1, "d"),
12-
(274, 6060, 2, "e"),
13-
(274, 6060, 2, "e1"),
14-
(883, 4530, 1, "f"),
6+
const TABLE: &[(u64, u64, u64, u64, &str)] = &[
7+
// dev, ino, size, links, path
8+
(0, 241, 3652, 1, "a"),
9+
(0, 569, 2210, 1, "b"),
10+
(0, 110, 2350, 3, "c"),
11+
(0, 110, 2350, 3, "c1"),
12+
(0, 778, 1110, 1, "d"),
13+
(0, 274, 6060, 2, "e"),
14+
(0, 274, 6060, 2, "e1"),
15+
(0, 883, 4530, 1, "f"),
1516
];
1617

1718
fn add<const ROW: usize>(list: HardlinkList<Bytes>) -> HardlinkList<Bytes> {
1819
let values = TABLE[ROW];
19-
let (ino, size, links, path) = values;
20-
if let Err(error) = list.add(ino.into(), size.into(), links, path.as_ref()) {
20+
let (dev, ino, size, links, path) = values;
21+
if let Err(error) = list.add(ino.into(), dev, size.into(), links, path.as_ref()) {
2122
panic!("Failed to add {values:?} (index: {ROW}) to the list: {error}");
2223
}
2324
list
@@ -119,10 +120,10 @@ fn insertion_difference_cause_inequality() {
119120
#[test]
120121
fn detect_size_change() {
121122
let list = HardlinkList::<Bytes>::new();
122-
list.add(123.into(), 100.into(), 1, "a".as_ref())
123+
list.add(123.into(), 0, 100.into(), 1, "a".as_ref())
123124
.expect("add the first path");
124125
let actual = list
125-
.add(123.into(), 110.into(), 1, "b".as_ref())
126+
.add(123.into(), 0, 110.into(), 1, "b".as_ref())
126127
.expect_err("add the second path");
127128
let expected = AddError::SizeConflict(SizeConflictError {
128129
ino: 123.into(),
@@ -135,10 +136,10 @@ fn detect_size_change() {
135136
#[test]
136137
fn detect_number_of_links_change() {
137138
let list = HardlinkList::<Bytes>::new();
138-
list.add(123.into(), 100.into(), 1, "a".as_ref())
139+
list.add(123.into(), 0, 100.into(), 1, "a".as_ref())
139140
.expect("add the first path");
140141
let actual = list
141-
.add(123.into(), 100.into(), 2, "b".as_ref())
142+
.add(123.into(), 0, 100.into(), 2, "b".as_ref())
142143
.expect_err("add the second path");
143144
let expected = AddError::NumberOfLinksConflict(NumberOfLinksConflictError {
144145
ino: 123.into(),
@@ -147,3 +148,38 @@ fn detect_number_of_links_change() {
147148
});
148149
assert_eq!(actual, expected);
149150
}
151+
152+
/// Files on different devices may share the same inode number, but they are
153+
/// unrelated — hardlinks cannot span filesystem boundaries. Verify that two
154+
/// files with the same inode number but different device numbers produce
155+
/// separate entries in the list (i.e. the device number is actually used in
156+
/// the deduplication key).
157+
#[test]
158+
fn same_ino_on_different_devices_are_treated_separately() {
159+
let list = HardlinkList::<Bytes>::new();
160+
161+
// dev=1, ino=100 — first filesystem
162+
list.add(100.into(), 1, 50.into(), 2, "dev1/file_a".as_ref())
163+
.expect("add dev1/file_a");
164+
list.add(100.into(), 1, 50.into(), 2, "dev1/file_b".as_ref())
165+
.expect("add dev1/file_b (same dev+ino → same inode group)");
166+
167+
// dev=2, ino=100 — second filesystem, coincidentally same inode number
168+
list.add(100.into(), 2, 80.into(), 2, "dev2/file_c".as_ref())
169+
.expect("add dev2/file_c (different dev → separate inode group)");
170+
list.add(100.into(), 2, 80.into(), 2, "dev2/file_d".as_ref())
171+
.expect("add dev2/file_d (same dev+ino → same inode group as file_c)");
172+
173+
// Each device should produce its own entry, so the list should have 2 entries.
174+
assert_eq!(list.len(), 2, "expected one entry per (dev, ino) pair");
175+
176+
let reflection = list.into_reflection();
177+
// Both entries expose ino=100 in the reflection (device is not part of the
178+
// public JSON format), so there are still 2 entries in the vector.
179+
assert_eq!(reflection.len(), 2);
180+
181+
// Paths are grouped per (dev, ino): each group has exactly 2 paths.
182+
for entry in reflection.iter() {
183+
assert_eq!(entry.paths.len(), 2);
184+
}
185+
}

0 commit comments

Comments
 (0)