-
Notifications
You must be signed in to change notification settings - Fork 2.1k
perf(backend): replace per-file sleep(480) deletion timers with a single janitor thread #7855
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 8 commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
b5acc75
feat(backend): single-thread deferred-deletion scheduler
mdmohsin7 9b8180c
feat(storage): schedule_syncing_temporal_file_deletion via janitor; r…
mdmohsin7 f27ffd7
refactor(sync): use deferred-deletion janitor for segment wav cleanup
mdmohsin7 dd0a6ab
refactor(chat): use deferred-deletion janitor at all three voice-mess…
mdmohsin7 85cf7e1
test(backend): behavioral + structural coverage for the deletion janitor
mdmohsin7 3b58469
test(storage): precache semaphore assertion 2 -> 4
mdmohsin7 91c70e6
test(sync): drop executor-swap machinery obsoleted by the deletion ja…
mdmohsin7 7121d75
test(backend): register test_deferred_blob_janitor in test.sh
mdmohsin7 d0de8a3
fix(backend): restart janitor thread if killed by a BaseException
mdmohsin7 814428f
test(backend): janitor restarts after BaseException kills the thread
mdmohsin7 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| """ | ||
| Tests for the deferred-deletion janitor (utils/other/deferred_delete.py). | ||
|
|
||
| One janitor thread + a due-time heap replace the previous per-file | ||
| time.sleep(480) on storage_executor, which parked ~70% of the pool's 128 | ||
| threads as idle timers (#7531). | ||
|
|
||
| Behavioral tests exercise the real DeferredDeleter (the module has no heavy | ||
| imports). Structural tests verify the four former sleep-sites now use the | ||
| scheduler. | ||
| """ | ||
|
|
||
| import os | ||
| import threading | ||
| import time | ||
|
|
||
| from utils.other.deferred_delete import DeferredDeleter | ||
|
|
||
|
|
||
| def _read_source(rel_path): | ||
| base = os.path.join(os.path.dirname(__file__), '..', '..') | ||
| with open(os.path.join(base, rel_path), encoding='utf-8') as f: | ||
| return f.read() | ||
|
|
||
|
|
||
| def _wait_for(predicate, timeout=2.0): | ||
| deadline = time.monotonic() + timeout | ||
| while time.monotonic() < deadline: | ||
| if predicate(): | ||
| return True | ||
| time.sleep(0.01) | ||
| return predicate() | ||
|
|
||
|
|
||
| class TestDeferredDeleterBehavior: | ||
| def test_deletes_after_delay(self): | ||
| deleted = [] | ||
| d = DeferredDeleter(deleted.append) | ||
| d.schedule('a.wav', 0.05) | ||
| assert not deleted, 'must not delete before the delay elapses' | ||
| assert _wait_for(lambda: deleted == ['a.wav']) | ||
| assert d.pending_count() == 0 | ||
|
|
||
| def test_out_of_order_schedules_delete_in_due_order(self): | ||
| deleted = [] | ||
| lock = threading.Lock() | ||
|
|
||
| def record(path): | ||
| with lock: | ||
| deleted.append(path) | ||
|
|
||
| d = DeferredDeleter(record) | ||
| d.schedule('later.wav', 0.3) | ||
| d.schedule('sooner.wav', 0.05) | ||
| assert _wait_for(lambda: len(deleted) == 2) | ||
| assert deleted == ['sooner.wav', 'later.wav'] | ||
|
|
||
| def test_earlier_schedule_interrupts_long_wait(self): | ||
| """A near-term schedule arriving while the janitor waits on a far-future | ||
| item must fire on time, not after the far-future wait.""" | ||
| deleted = [] | ||
| d = DeferredDeleter(deleted.append) | ||
| d.schedule('far.wav', 30) | ||
| d.schedule('near.wav', 0.05) | ||
| assert _wait_for(lambda: 'near.wav' in deleted) | ||
| assert 'far.wav' not in deleted | ||
| assert d.pending_count() == 1 | ||
|
|
||
| def test_failing_delete_does_not_kill_janitor(self): | ||
| deleted = [] | ||
|
|
||
| def flaky(path): | ||
| if path == 'boom.wav': | ||
| raise RuntimeError('gcs down') | ||
| deleted.append(path) | ||
|
|
||
| d = DeferredDeleter(flaky) | ||
| d.schedule('boom.wav', 0.02) | ||
| d.schedule('ok.wav', 0.05) | ||
| assert _wait_for(lambda: deleted == ['ok.wav']) | ||
|
|
||
| def test_single_thread_reused_across_schedules(self): | ||
| d = DeferredDeleter(lambda path: None) | ||
| d.schedule('a.wav', 0.01) | ||
| first_thread = d._thread | ||
| assert _wait_for(lambda: d.pending_count() == 0) | ||
| d.schedule('b.wav', 0.01) | ||
| assert d._thread is first_thread | ||
| assert _wait_for(lambda: d.pending_count() == 0) | ||
|
|
||
| def test_many_pending_use_one_thread(self): | ||
| before = threading.active_count() | ||
| d = DeferredDeleter(lambda path: None) | ||
| for i in range(200): | ||
| d.schedule(f'{i}.wav', 60) | ||
| assert d.pending_count() == 200 | ||
| assert threading.active_count() <= before + 1, '200 pending deletions must cost exactly one thread' | ||
|
|
||
|
|
||
| class TestSleepPatternRemoved: | ||
| def test_no_sleep_480_remains_in_backend(self): | ||
| for rel in ('routers/sync.py', 'utils/chat.py'): | ||
| assert 'time.sleep(480)' not in _read_source(rel), f'{rel} still parks threads as deletion timers' | ||
|
|
||
| def test_sync_uses_scheduler(self): | ||
| assert 'schedule_syncing_temporal_file_deletion(path)' in _read_source('routers/sync.py') | ||
|
|
||
| def test_chat_uses_scheduler_at_all_three_sites(self): | ||
| assert _read_source('utils/chat.py').count('schedule_syncing_temporal_file_deletion(path)') == 3 | ||
|
|
||
| def test_storage_defines_scheduler_with_480_default(self): | ||
| src = _read_source('utils/other/storage.py') | ||
| assert 'SYNCING_TEMPORAL_DELETE_DELAY_SECONDS = 480' in src | ||
| assert 'def schedule_syncing_temporal_file_deletion' in src | ||
|
|
||
| def test_precache_sem_restored(self): | ||
| # 4 → 2 was load-shedding while the pool was full of sleepers (#7526) | ||
| assert '_PRECACHE_FILE_SEM = threading.BoundedSemaphore(4)' in _read_source('utils/other/storage.py') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| """Single-thread deferred-deletion scheduler. | ||
|
|
||
| Replaces the previous pattern of parking an executor thread in | ||
| time.sleep(480) per file: at sync volume that kept ~70% of | ||
| storage_executor's 128 threads asleep as ad-hoc timers, which is what | ||
| drove the pool's repeated saturation (#7531). One daemon thread and a | ||
| due-time heap handle any number of pending deletions. | ||
|
|
||
| Best-effort by design: pending deletions are lost on process death, same | ||
| as the sleeping threads were; the syncing bucket's lifecycle rule is the | ||
| backstop. | ||
| """ | ||
|
|
||
| import heapq | ||
| import logging | ||
| import threading | ||
| import time | ||
| from typing import Callable | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| class DeferredDeleter: | ||
| def __init__(self, delete_fn: Callable[[str], None], name: str = 'deferred-delete-janitor'): | ||
| self._delete_fn = delete_fn | ||
| self._name = name | ||
| self._cond = threading.Condition() | ||
| self._heap = [] # (due_monotonic, seq, path) | ||
| self._seq = 0 | ||
| self._thread = None | ||
|
|
||
| def schedule(self, path: str, delay_seconds: float) -> None: | ||
| """Schedule path for deletion after delay_seconds. O(log n), never blocks.""" | ||
| with self._cond: | ||
| self._seq += 1 | ||
| heapq.heappush(self._heap, (time.monotonic() + delay_seconds, self._seq, path)) | ||
| if self._thread is None: | ||
| self._thread = threading.Thread(target=self._run, name=self._name, daemon=True) | ||
| self._thread.start() | ||
| self._cond.notify() | ||
|
|
||
| def pending_count(self) -> int: | ||
| with self._cond: | ||
| return len(self._heap) | ||
|
|
||
| def _run(self): | ||
| while True: | ||
| with self._cond: | ||
| while not self._heap: | ||
| self._cond.wait() | ||
| due, _, path = self._heap[0] | ||
| delay = due - time.monotonic() | ||
| if delay > 0: | ||
| # A schedule() for an earlier due-time re-notifies and we re-peek | ||
| self._cond.wait(timeout=delay) | ||
| continue | ||
| heapq.heappop(self._heap) | ||
| try: | ||
| self._delete_fn(path) | ||
| except Exception as e: | ||
| logger.warning('deferred delete failed for %s: %s', path, e) | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
_threadis set once and never cleared. If_runexits unexpectedly — e.g., via aBaseExceptionsubclass likeMemoryErrororSystemExitthat bypasses theexcept Exceptioncatch —_threadwill still point to a deadThreadobject. Every subsequentschedule()call skips theif self._thread is None:branch, items pile up in the heap, and no deletion ever fires. The lifecycle rule is the backstop, but this is a silent failure rather than a logged one. Changing the guard toif self._thread is None or not self._thread.is_alive():would restart the janitor in the rare case it dies.