Skip to content

Commit ecc9a4a

Browse files
committed
relink.py: Use os.scandir() instead of os.walk() for efficiency.
os.scandir() returns directory entries that already carry file-type information (and that cache any stat result they fetch), so this reduces system calls during directory traversal. Runtime on Derecho drops by about 84%, from ~230 seconds to ~37 seconds (n=1 of each).
1 parent 8899dce commit ecc9a4a

5 files changed

Lines changed: 313 additions & 119 deletions

File tree

relink.py

Lines changed: 90 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,49 @@ def always(self, message, *args, **kwargs):
3434
logging.Logger.always = always
3535

3636

37-
def find_and_replace_owned_files(source_dir, target_dir, username, dry_run=False):
37+
def find_owned_files_scandir(directory, user_uid):
    """
    Recursively find regular files owned by a specific user using os.scandir().

    os.scandir() is cheaper than os.walk() plus os.stat() because each
    DirEntry usually knows its file type from the directory listing itself,
    so is_file()/is_dir() normally need no extra system call, and the result
    of DirEntry.stat() is cached on the entry after the first call.

    Each directory is read completely, and its handle closed, before any
    path is yielded. Callers are therefore free to rename, delete or replace
    the yielded files while iterating: entries they create in a directory
    that has already been listed are not visited, and no directory handle is
    held open across a yield or across the recursion into subdirectories.

    Symbolic links are never followed; they are logged and skipped.
    Directories and entries that cannot be accessed are logged at debug
    level and skipped.

    Args:
        directory (str): The root directory to search.
        user_uid (int): The UID of the user whose files to find.

    Yields:
        str: Paths to files owned by the user. Each path is ``directory``
            joined with the entry's relative location, so it is absolute
            only if ``directory`` is absolute.
    """
    try:
        # Snapshot the listing so the caller may modify this directory
        # between yields without disturbing an in-progress directory read.
        with os.scandir(directory) as entries:
            snapshot = list(entries)
    except OSError as e:  # PermissionError is a subclass of OSError
        logger.debug("Error accessing %s: %s. Skipping.", directory, e)
        return

    for entry in snapshot:
        # Only the filesystem queries sit inside the try block; the yields
        # happen afterwards so that errors are attributed to this entry.
        try:
            is_file = entry.is_file(follow_symlinks=False)
            is_dir = not is_file and entry.is_dir(follow_symlinks=False)
            owned = False
            if is_file:
                # lstat-style lookup: ownership of the entry itself
                owned = entry.stat(follow_symlinks=False).st_uid == user_uid
            elif not is_dir and entry.is_symlink():
                logger.info("Skipping symlink: %s", entry.path)
        except OSError as e:
            logger.debug("Error accessing %s: %s. Skipping.", entry.path, e)
            continue

        if is_file:
            if owned:
                yield entry.path
        elif is_dir:
            # Recursively process directories (not following symlinks)
            yield from find_owned_files_scandir(entry.path, user_uid)
77+
78+
79+
def replace_files_with_symlinks(source_dir, target_dir, username, dry_run=False):
3880
"""
3981
Finds files owned by a specific user in a source directory tree,
4082
deletes them, and replaces them with symbolic links to the same
@@ -66,70 +108,52 @@ def find_and_replace_owned_files(source_dir, target_dir, username, dry_run=False
66108
source_dir,
67109
)
68110

69-
for dirpath, _, filenames in os.walk(source_dir):
70-
for filename in filenames:
71-
file_path = os.path.join(dirpath, filename)
72-
73-
# Use os.stat().st_uid to get the file's owner UID
74-
try:
75-
if os.path.islink(file_path):
76-
logger.info("Skipping symlink: %s", file_path)
77-
continue
78-
79-
file_uid = os.stat(file_path).st_uid
80-
except FileNotFoundError:
81-
continue # Skip if file was deleted during traversal
82-
83-
if file_uid == user_uid:
84-
logger.info("Found owned file: %s", file_path)
85-
86-
# Determine the relative path and the new link's destination
87-
relative_path = os.path.relpath(file_path, source_dir)
88-
link_target = os.path.join(target_dir, relative_path)
89-
90-
# Check if the target file actually exists
91-
if not os.path.exists(link_target):
92-
logger.warning(
93-
"Warning: Corresponding file not found in '%s' "
94-
"for '%s'. Skipping.",
95-
target_dir,
96-
file_path,
97-
)
98-
continue
99-
100-
# Get the link name
101-
link_name = file_path
102-
103-
if dry_run:
104-
logger.info(
105-
"[DRY RUN] Would create symbolic link: %s -> %s",
106-
link_name,
107-
link_target,
108-
)
109-
continue
110-
111-
# Remove the original file
112-
try:
113-
os.rename(link_name, link_name + ".tmp")
114-
logger.info("Deleted original file: %s", link_name)
115-
except OSError as e:
116-
logger.error("Error deleting file %s: %s. Skipping.", link_name, e)
117-
continue
118-
119-
# Create the symbolic link, handling necessary parent directories
120-
try:
121-
# Create parent directories for the link if they don't exist
122-
os.makedirs(os.path.dirname(link_name), exist_ok=True)
123-
os.symlink(link_target, link_name)
124-
os.remove(link_name + ".tmp")
125-
logger.info(
126-
"Created symbolic link: %s -> %s", link_name, link_target
127-
)
128-
except OSError as e:
129-
os.rename(link_name + ".tmp", link_name)
130-
logger.error(
131-
"Error creating symlink for %s: %s. Skipping.", link_name, e
132-
)
111+
# Use efficient scandir-based search
112+
for file_path in find_owned_files_scandir(source_dir, user_uid):
113+
logger.info("Found owned file: %s", file_path)
114+
115+
# Determine the relative path and the new link's destination
116+
relative_path = os.path.relpath(file_path, source_dir)
117+
link_target = os.path.join(target_dir, relative_path)
118+
119+
# Check if the target file actually exists
120+
if not os.path.exists(link_target):
121+
logger.warning(
122+
"Warning: Corresponding file not found in '%s' for '%s'. Skipping.",
123+
target_dir,
124+
file_path,
125+
)
126+
continue
127+
128+
# Get the link name
129+
link_name = file_path
130+
131+
if dry_run:
132+
logger.info(
133+
"[DRY RUN] Would create symbolic link: %s -> %s",
134+
link_name,
135+
link_target,
136+
)
137+
continue
138+
139+
# Remove the original file
140+
try:
141+
os.rename(link_name, link_name + ".tmp")
142+
logger.info("Deleted original file: %s", link_name)
143+
except OSError as e:
144+
logger.error("Error deleting file %s: %s. Skipping.", link_name, e)
145+
continue
146+
147+
# Create the symbolic link, handling necessary parent directories
148+
try:
149+
# Create parent directories for the link if they don't exist
150+
os.makedirs(os.path.dirname(link_name), exist_ok=True)
151+
os.symlink(link_target, link_name)
152+
os.remove(link_name + ".tmp")
153+
logger.info("Created symbolic link: %s -> %s", link_name, link_target)
154+
except OSError as e:
155+
os.rename(link_name + ".tmp", link_name)
156+
logger.error("Error creating symlink for %s: %s. Skipping.", link_name, e)
133157

134158

135159
def validate_directory(path):
@@ -242,7 +266,7 @@ def main():
242266
start_time = time.time()
243267

244268
# --- Execution ---
245-
find_and_replace_owned_files(
269+
replace_files_with_symlinks(
246270
args.source_root, args.target_root, my_username, dry_run=args.dry_run
247271
)
248272

tests/relink/test_dryrun.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def test_dry_run_no_changes(dry_run_setup, caplog):
4545

4646
# Run in dry-run mode
4747
with caplog.at_level(logging.INFO):
48-
relink.find_and_replace_owned_files(
48+
relink.replace_files_with_symlinks(
4949
source_dir, target_dir, username, dry_run=True
5050
)
5151

@@ -63,7 +63,7 @@ def test_dry_run_shows_message(dry_run_setup, caplog):
6363

6464
# Run in dry-run mode
6565
with caplog.at_level(logging.INFO):
66-
relink.find_and_replace_owned_files(
66+
relink.replace_files_with_symlinks(
6767
source_dir, target_dir, username, dry_run=True
6868
)
6969

@@ -79,7 +79,7 @@ def test_dry_run_no_delete_or_create_messages(dry_run_setup, caplog):
7979

8080
# Run in dry-run mode
8181
with caplog.at_level(logging.INFO):
82-
relink.find_and_replace_owned_files(
82+
relink.replace_files_with_symlinks(
8383
source_dir, target_dir, username, dry_run=True
8484
)
8585

0 commit comments

Comments
 (0)