diff --git a/dvc/cli/parser.py b/dvc/cli/parser.py
index c92c8276c6..2a548b1abd 100644
--- a/dvc/cli/parser.py
+++ b/dvc/cli/parser.py
@@ -39,6 +39,7 @@
     move,
     params,
     plots,
+    purge,
     queue,
     remote,
     remove,
@@ -90,6 +91,7 @@
     move,
     params,
     plots,
+    purge,
     queue,
     remote,
     remove,
diff --git a/dvc/commands/purge.py b/dvc/commands/purge.py
new file mode 100644
index 0000000000..381caf3dde
--- /dev/null
+++ b/dvc/commands/purge.py
@@ -0,0 +1,99 @@
+import os
+
+from dvc.cli import formatter
+from dvc.cli.command import CmdBase
+from dvc.cli.utils import append_doc_link
+from dvc.log import logger
+from dvc.ui import ui
+
+logger = logger.getChild(__name__)
+
+
+class CmdPurge(CmdBase):
+    def run(self):
+        if not self.args.dry_run:
+            msg = "This will permanently remove local DVC-tracked outputs "
+        else:
+            msg = "This will show what local DVC-tracked outputs would be removed "
+        if self.args.targets:
+            msg += "for the following targets:\n - " + "\n - ".join(
+                [os.path.abspath(t) for t in self.args.targets]
+            )
+        else:
+            msg += "for the entire workspace."
+
+        if self.args.recursive:
+            msg += "\nRecursive purge is enabled."
+
+        if self.args.dry_run:
+            msg += "\n(dry-run: showing what would be removed, no changes)."
+
+        logger.warning(msg)
+
+        if (
+            not self.args.force
+            and not self.args.dry_run
+            and not self.args.yes
+            and not ui.confirm("Are you sure you want to proceed?")
+        ):
+            return 1
+
+        # Call repo API
+        self.repo.purge(
+            targets=self.args.targets,
+            recursive=self.args.recursive,
+            force=self.args.force,
+            dry_run=self.args.dry_run,
+        )
+        return 0
+
+
+def add_parser(subparsers, parent_parser):
+    PURGE_HELP = "Remove tracked outputs and their cache."
+    PURGE_DESCRIPTION = (
+        "Removes cache objects and workspace copies of DVC-tracked outputs.\n"
+        "Metadata remains intact, and non-DVC files are untouched."
+    )
+    purge_parser = subparsers.add_parser(
+        "purge",
+        parents=[parent_parser],
+        description=append_doc_link(PURGE_DESCRIPTION, "purge"),
+        help=PURGE_HELP,
+        formatter_class=formatter.RawDescriptionHelpFormatter,
+    )
+
+    purge_parser.add_argument(
+        "targets",
+        nargs="*",
+        help="Optional list of files/directories to purge (default: entire repo).",
+    )
+    purge_parser.add_argument(
+        "-r",
+        "--recursive",
+        action="store_true",
+        default=False,
+        help="Recursively purge directories.",
+    )
+    purge_parser.add_argument(
+        "--dry-run",
+        dest="dry_run",
+        action="store_true",
+        default=False,
+        help="Only print what would be removed without actually removing.",
+    )
+    purge_parser.add_argument(
+        "-f",
+        "--force",
+        action="store_true",
+        default=False,
+        help="Force purge, bypassing safety checks and prompts.",
+    )
+    purge_parser.add_argument(
+        "-y",
+        "--yes",
+        action="store_true",
+        default=False,
+        help="Do not prompt for confirmation (respects safety checks).",
+    )
+
+    purge_parser.set_defaults(func=CmdPurge)
diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py
index ba850bb6ee..bc40de962c 100644
--- a/dvc/repo/__init__.py
+++ b/dvc/repo/__init__.py
@@ -82,6 +82,7 @@ class Repo:
     from dvc.repo.ls_url import ls_url as _ls_url  # type: ignore[misc]
     from dvc.repo.move import move  # type: ignore[misc]
     from dvc.repo.pull import pull  # type: ignore[misc]
+    from dvc.repo.purge import purge  # type: ignore[misc]
     from dvc.repo.push import push  # type: ignore[misc]
     from dvc.repo.remove import remove  # type: ignore[misc]
     from dvc.repo.reproduce import reproduce  # type: ignore[misc]
diff --git a/dvc/repo/purge.py b/dvc/repo/purge.py
new file mode 100644
index 0000000000..9370d2dfd5
--- /dev/null
+++ b/dvc/repo/purge.py
@@ -0,0 +1,165 @@
+from typing import TYPE_CHECKING, Optional
+
+from dvc.config import NoRemoteError, RemoteNotFoundError
+from dvc.exceptions import DvcException
+from dvc.log import logger
+
+from . import locked
+
+if TYPE_CHECKING:
+    from dvc.output import Output
+    from dvc.repo import Repo
+
+logger = logger.getChild(__name__)
+
+
+class PurgeError(DvcException):
+    """Raised when purge fails due to safety or internal errors."""
+
+
+def _flatten_stages_or_outs(items) -> list["Output"]:
+    """Normalize collect() results into a flat list of Output objects."""
+    outs = []
+    for item in items:
+        if isinstance(item, list):
+            outs.extend(_flatten_stages_or_outs(item))
+        elif hasattr(item, "outs"):  # Stage
+            outs.extend(item.outs)
+        elif hasattr(item, "use_cache"):  # Already an Output
+            outs.append(item)
+        else:
+            logger.debug("Skipping non-stage item in collect(): %r", item)
+    return outs
+
+
+def _check_dirty(outs, force: bool) -> None:
+    dirty = [o for o in outs if o.use_cache and o.changed()]
+    if dirty and not force:
+        raise PurgeError(
+            "Some tracked outputs have uncommitted changes. "
+            "Use `--force` to purge anyway.\n - "
+            + "\n - ".join(str(o) for o in dirty)
+        )
+
+
+def _get_remote_odb(repo: "Repo"):
+    try:
+        return repo.cloud.get_remote_odb(None)
+    except (RemoteNotFoundError, NoRemoteError):
+        return None
+
+
+def _check_remote_backup(repo: "Repo", outs, force: bool) -> None:
+    remote_odb = _get_remote_odb(repo)
+
+    if not remote_odb:
+        if not force:
+            raise PurgeError(
+                "No default remote configured. "
+                "Cannot safely purge outputs without verifying remote backup.\n"
+                "Use `--force` to purge anyway."
+            )
+        logger.warning(
+            "No default remote configured. Proceeding with purge due to --force. "
+            "Outputs may be permanently lost."
+        )
+        return
+
+    # remote exists, check objects
+    not_in_remote = [
+        str(o)
+        for o in outs
+        if o.use_cache
+        and o.hash_info
+        and o.hash_info.value
+        and not remote_odb.exists(o.hash_info.value)
+    ]
+    if not_in_remote and not force:
+        raise PurgeError(
+            "Some outputs are not present in the remote cache and would be "
+            "permanently lost if purged:\n - "
+            + "\n - ".join(not_in_remote)
+            + "\nUse `--force` to purge anyway."
+        )
+    if not_in_remote and force:
+        logger.warning(
+            "Some outputs are not present in the remote cache and may be "
+            "permanently lost:\n - %s",
+            "\n - ".join(not_in_remote),
+        )
+
+
+def _remove_outs(outs, dry_run: bool) -> int:
+    removed = 0
+    for out in outs:
+        if dry_run:
+            logger.info("[dry-run] Would remove %s", out)
+            removed += 1
+            continue
+
+        try:
+            # remove workspace file
+            if out.exists:
+                out.remove(ignore_remove=False)
+
+            # remove cache entry
+            # NOTE: for directories this removes only the .dir manifest object
+            if out.use_cache and out.hash_info:
+                cache_path = out.cache.oid_to_path(out.hash_info.value)
+                if out.cache.fs.exists(cache_path):
+                    out.cache.fs.remove(cache_path, recursive=True)
+
+            removed += 1
+        except Exception:
+            logger.exception("Failed to remove %s", out)
+    return removed
+
+
+@locked
+def purge(
+    self: "Repo",
+    targets: Optional[list[str]] = None,
+    recursive: bool = False,
+    force: bool = False,
+    dry_run: bool = False,
+) -> int:
+    """
+    Purge removes local copies of DVC-tracked outputs and their cache.
+
+    - Collects outs from .dvc files and dvc.yaml.
+    - Ensures safety (no dirty outs unless --force).
+    - Ensures outputs are backed up to remote (unless --force).
+    - Removes both workspace copies and cache objects.
+    - Metadata remains intact.
+    """
+    from dvc.repo.collect import collect
+    from dvc.stage.exceptions import StageFileDoesNotExistError
+
+    try:
+        items = (
+            collect(self, targets=targets, recursive=recursive)
+            if targets
+            else list(self.index.stages)
+        )
+    except StageFileDoesNotExistError as e:
+        raise PurgeError(str(e)) from e
+
+    outs = _flatten_stages_or_outs(items)
+    if not outs:
+        logger.info("No DVC-tracked outputs found to purge.")
+        return 0
+
+    # Run safety checks
+    _check_dirty(outs, force)
+    _check_remote_backup(self, outs, force)
+
+    # Remove outs
+    removed = _remove_outs(outs, dry_run)
+
+    if dry_run:
+        logger.info("Would remove %d outputs (workspace + cache).", removed)
+    elif removed:
+        logger.info("Removed %d outputs (workspace + cache).", removed)
+    else:
+        logger.info("Nothing to purge.")
+    return 0
diff --git a/tests/func/test_purge.py b/tests/func/test_purge.py
new file mode 100644
index 0000000000..1e6a9f8168
--- /dev/null
+++ b/tests/func/test_purge.py
@@ -0,0 +1,152 @@
+from pathlib import Path
+
+import pytest
+
+from dvc.cli import main
+from dvc.repo.purge import PurgeError
+
+
+def test_purge_no_remote_configured_errors(tmp_dir, dvc):
+    tmp_dir.dvc_gen("foo", "foo")
+    with pytest.raises(PurgeError):
+        dvc.purge()
+
+
+def test_purge_no_remote_configured_with_force_warns(tmp_dir, dvc, caplog):
+    tmp_dir.dvc_gen("foo", "foo")
+    caplog.clear()
+    dvc.purge(force=True)
+    assert (
+        "No default remote configured. Proceeding with purge due to --force"
+        in caplog.text
+    )
+
+
+def test_purge_api_removes_file_and_cache(tmp_dir, dvc, make_remote):
+    make_remote("backup", default=True)
+    (stage,) = tmp_dir.dvc_gen("foo", "foo")
+    assert (tmp_dir / "foo").exists()
+    assert Path(stage.outs[0].cache_path).exists()
+
+    dvc.push("foo")  # ensure remote has backup
+
+    dvc.purge()
+
+    # workspace file gone, cache gone, metadata remains
+    assert not (tmp_dir / "foo").exists()
+    assert not Path(stage.outs[0].cache_path).exists()
+    assert (tmp_dir / "foo.dvc").exists()
+
+
+def test_purge_cli_removes_file_and_cache(tmp_dir, dvc, make_remote):
+    make_remote("backup", default=True)
+    (stage,) = tmp_dir.dvc_gen("bar", "bar")
+    assert (tmp_dir / "bar").exists()
+    assert Path(stage.outs[0].cache_path).exists()
+
+    # force will skip check that remote has backup
+    assert main(["purge", "--force"]) == 0
+
+    assert not (tmp_dir / "bar").exists()
+    assert not Path(stage.outs[0].cache_path).exists()
+    assert (tmp_dir / "bar.dvc").exists()
+
+
+def test_purge_targets_only(tmp_dir, dvc, make_remote):
+    make_remote("backup", default=True)
+    tmp_dir.dvc_gen({"dir": {"a.txt": "A", "b.txt": "B"}})
+    assert (tmp_dir / "dir" / "a.txt").exists()
+    assert (tmp_dir / "dir" / "b.txt").exists()
+
+    dvc.purge(targets=[str(tmp_dir / "dir")], force=True)
+
+    assert not (tmp_dir / "dir").exists()
+    assert (tmp_dir / "dir.dvc").exists()
+
+
+def test_purge_recursive(tmp_dir, dvc, make_remote):
+    make_remote("backup", default=True)
+    tmp_dir.dvc_gen({"nested": {"sub": {"file.txt": "content"}}})
+    assert (tmp_dir / "nested" / "sub" / "file.txt").exists()
+
+    dvc.purge(targets=["nested"], recursive=True, force=True)
+    assert not (tmp_dir / "nested" / "sub" / "file.txt").exists()
+
+
+def test_purge_individual_targets(tmp_dir, dvc, make_remote):
+    make_remote("backup", default=True)
+
+    # Generate two *separate* tracked files
+    (stage_a,) = tmp_dir.dvc_gen("a.txt", "A")
+    (stage_b,) = tmp_dir.dvc_gen("b.txt", "B")
+
+    assert (tmp_dir / "a.txt").exists()
+    assert (tmp_dir / "b.txt").exists()
+    assert Path(stage_a.outs[0].cache_path).exists()
+    assert Path(stage_b.outs[0].cache_path).exists()
+
+    # Push both so purge passes remote safety
+    dvc.push()
+
+    # Purge only a.txt
+    dvc.purge(targets=[str(tmp_dir / "a.txt")])
+
+    # a.txt and its cache should be gone, but metadata intact
+    assert not (tmp_dir / "a.txt").exists()
+    assert not Path(stage_a.outs[0].cache_path).exists()
+    assert (tmp_dir / "a.txt.dvc").exists()
+
+    # b.txt and its cache should still exist
+    assert (tmp_dir / "b.txt").exists()
+    assert Path(stage_b.outs[0].cache_path).exists()
+    assert (tmp_dir / "b.txt.dvc").exists()
+
+
+def test_purge_dry_run_does_not_delete(tmp_dir, dvc, make_remote):
+    make_remote("backup", default=True)
+    (stage,) = tmp_dir.dvc_gen("baz", "baz")
+    cache_path = Path(stage.outs[0].cache_path)
+
+    dvc.purge(dry_run=True, force=True)
+
+    assert (tmp_dir / "baz").exists()
+    assert cache_path.exists()
+
+
+def test_purge_dirty_file_requires_force(tmp_dir, dvc, make_remote):
+    make_remote("backup", default=True)
+    tmp_dir.dvc_gen("foo", "foo")
+    (tmp_dir / "foo").write_text("modified")
+
+    with pytest.raises(PurgeError):
+        dvc.purge()
+
+    dvc.purge(force=True)
+    assert not (tmp_dir / "foo").exists()
+
+
+def test_purge_missing_remote_object_requires_force(tmp_dir, dvc, make_remote):
+    make_remote("backup", default=True)
+    tmp_dir.dvc_gen("foo", "foo")
+    dvc.push("foo")
+
+    remote = dvc.cloud.get_remote_odb("backup")
+    remote.fs.remove(remote.path, recursive=True)  # wipe remote
+
+    with pytest.raises(PurgeError):
+        dvc.purge()
+
+
+def test_purge_missing_remote_object_with_force_warns(
+    tmp_dir, dvc, make_remote, caplog
+):
+    make_remote("backup", default=True)
+    tmp_dir.dvc_gen("foo", "foo")
+    dvc.push("foo")
+
+    remote = dvc.cloud.get_remote_odb("backup")
+    remote.fs.remove(remote.path, recursive=True)  # wipe remote
+
+    caplog.clear()
+    dvc.purge(force=True)
+    assert "Some outputs are not present in the remote cache" in caplog.text
diff --git a/tests/unit/command/test_purge.py b/tests/unit/command/test_purge.py
new file mode 100644
index 0000000000..c7b51c5220
--- /dev/null
+++ b/tests/unit/command/test_purge.py
@@ -0,0 +1,76 @@
+import pytest
+
+from dvc.cli import parse_args
+from dvc.commands.purge import CmdPurge
+from dvc.repo.purge import PurgeError
+
+
+def test_purge_args_and_call(dvc, scm, mocker):
+    cli_args = parse_args(
+        [
+            "purge",
+            "foo",
+            "bar",
+            "--recursive",
+            "--dry-run",
+            "--force",
+        ]
+    )
+    assert cli_args.func == CmdPurge
+
+    cmd = cli_args.func(cli_args)
+    mocker.patch("dvc.ui.ui.confirm", return_value=True)
+    m = mocker.patch("dvc.repo.Repo.purge", return_value=0)
+
+    assert cmd.run() == 0
+
+    m.assert_called_once_with(
+        targets=["foo", "bar"],
+        recursive=True,
+        force=True,
+        dry_run=True,
+    )
+
+
+def test_purge_defaults(dvc, mocker):
+    cli_args = parse_args(["purge"])
+    cmd = cli_args.func(cli_args)
+
+    mocker.patch("dvc.ui.ui.confirm", return_value=True)
+    m = mocker.patch("dvc.repo.Repo.purge", return_value=0)
+
+    assert cmd.run() == 0
+
+    m.assert_called_once_with(
+        targets=[],
+        recursive=False,
+        force=False,
+        dry_run=False,
+    )
+
+
+def test_purge_safety_error(dvc, mocker):
+    cli_args = parse_args(["purge"])
+    cmd = cli_args.func(cli_args)
+
+    mocker.patch("dvc.ui.ui.confirm", return_value=True)
+    m = mocker.patch("dvc.repo.Repo.purge", side_effect=PurgeError("dirty outs"))
+
+    with pytest.raises(PurgeError):
+        cmd.run()
+
+    m.assert_called_once()
+
+
+def test_purge_yes_skips_confirm(dvc, mocker):
+    cli_args = parse_args(["purge", "-y"])
+    cmd = cli_args.func(cli_args)
+
+    confirm = mocker.patch("dvc.ui.ui.confirm", return_value=True)
+    m = mocker.patch("dvc.repo.Repo.purge", return_value=0)
+
+    assert cmd.run() == 0
+
+    # -y should skip confirmation
+    confirm.assert_not_called()
+    m.assert_called_once()
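
Reviewer note (not part of the patch): a minimal usage sketch of the API added above, assuming a DVC repo whose outputs have already been pushed to a configured default remote. The target path is hypothetical.

    from dvc.repo import Repo

    repo = Repo()  # must be instantiated inside a DVC repository

    # Preview only: logs what would be removed; workspace and cache untouched.
    repo.purge(targets=["data/prepared"], recursive=True, dry_run=True)

    # Real purge: raises PurgeError if outputs are dirty or missing from the
    # default remote, unless force=True is passed.
    repo.purge(targets=["data/prepared"], recursive=True)

The CLI equivalent is `dvc purge data/prepared -r --dry-run`, then `dvc purge data/prepared -r`.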