<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="de">
	<id>https://lug-vs.org/index.php?action=history&amp;feed=atom&amp;title=Doppelte-Bilder-bzw-Dateien-finden-und-aufr%C3%A4umen</id>
	<title>Doppelte-Bilder-bzw-Dateien-finden-und-aufräumen - Versionsgeschichte</title>
	<link rel="self" type="application/atom+xml" href="https://lug-vs.org/index.php?action=history&amp;feed=atom&amp;title=Doppelte-Bilder-bzw-Dateien-finden-und-aufr%C3%A4umen"/>
	<link rel="alternate" type="text/html" href="https://lug-vs.org/index.php?title=Doppelte-Bilder-bzw-Dateien-finden-und-aufr%C3%A4umen&amp;action=history"/>
	<updated>2026-06-01T13:41:12Z</updated>
	<subtitle>Versionsgeschichte dieser Seite in lugvswiki</subtitle>
	<generator>MediaWiki 1.35.4</generator>
	<entry>
		<id>https://lug-vs.org/index.php?title=Doppelte-Bilder-bzw-Dateien-finden-und-aufr%C3%A4umen&amp;diff=8079&amp;oldid=prev</id>
		<title>Marc: Die Seite wurde neu angelegt: „= Doppelte-Bilder-bzw-Dateien-finden-und-aufräumen =   #!/usr/bin/env python      # Findet doppelte Bilder nach Größe und falls Exif-Daten vorhanden sind, n…“</title>
		<link rel="alternate" type="text/html" href="https://lug-vs.org/index.php?title=Doppelte-Bilder-bzw-Dateien-finden-und-aufr%C3%A4umen&amp;diff=8079&amp;oldid=prev"/>
		<updated>2023-12-30T09:37:47Z</updated>

		<summary type="html">&lt;p&gt;Die Seite wurde neu angelegt: „= Doppelte-Bilder-bzw-Dateien-finden-und-aufräumen =   #!/usr/bin/env python      # Findet doppelte Bilder nach Größe und falls Exif-Daten vorhanden sind, n…“&lt;/p&gt;
&lt;p&gt;&lt;b&gt;Neue Seite&lt;/b&gt;&lt;/p&gt;&lt;div&gt;= Doppelte-Bilder-bzw-Dateien-finden-und-aufräumen =&lt;br /&gt;
  #!/usr/bin/env python&lt;br /&gt;
  &lt;br /&gt;
  # Findet doppelte Bilder nach Größe und falls Exif-Daten vorhanden sind, nach Kamera-Modell / Aufnahmedatum.&lt;br /&gt;
  &lt;br /&gt;
  # Nehmen wir diesen Test an:&lt;br /&gt;
  &lt;br /&gt;
  # Verzeichnisse erstellen&lt;br /&gt;
  # mkdir -p find_groups/{a,b,c}&lt;br /&gt;
  # Inhalte erstellen&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/a/A&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/b/A&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/c/A&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/a/B&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/b/B&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/c/B&lt;br /&gt;
  # &lt;br /&gt;
  # echo 2 &amp;gt; find_groups/a/Z&lt;br /&gt;
  # echo 2 &amp;gt; find_groups/b/Z&lt;br /&gt;
  #&lt;br /&gt;
  # Dann werden 2 Gruppen gefunden:&lt;br /&gt;
  &lt;br /&gt;
  # GRUPPE INDEX=0&lt;br /&gt;
  # Verzeichnis 0: find_groups/a&lt;br /&gt;
  # Verzeichnis 1: find_groups/b&lt;br /&gt;
  # Verzeichnis 2: find_groups/c&lt;br /&gt;
  # Dateien A,B&lt;br /&gt;
  #&lt;br /&gt;
  # GRUPPE INDEX=1&lt;br /&gt;
  # Verzeichnis 0: find_groups/a&lt;br /&gt;
  # Verzeichnis 1: find_groups/b&lt;br /&gt;
  # Dateien Z&lt;br /&gt;
  #&lt;br /&gt;
  # Dann kann gesagt werden, dass Gruppenindex 0 aus Verzeichnis 1 gelöscht werden kann.&lt;br /&gt;
  #&lt;br /&gt;
  # Exif-Daten und Dateigrößen werden in cache.json gespeichert. Auch 1 TB an Daten kann so&lt;br /&gt;
  # schnell und wiederholt verglichen werden.&lt;br /&gt;
  &lt;br /&gt;
  &lt;br /&gt;
  # finds duplicate images&lt;br /&gt;
  # by size and, if EXIF data is present, by camera model / capture date&lt;br /&gt;
  #&lt;br /&gt;
  # Assume this test:&lt;br /&gt;
  # &lt;br /&gt;
  # mkdir -p find_groups/{a,b,c}&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/a/A&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/b/A&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/c/A&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/a/B&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/b/B&lt;br /&gt;
  # echo 1 &amp;gt; find_groups/c/B&lt;br /&gt;
  # &lt;br /&gt;
  # echo 2 &amp;gt; find_groups/a/Z&lt;br /&gt;
  # echo 2 &amp;gt; find_groups/b/Z&lt;br /&gt;
  #&lt;br /&gt;
  #&lt;br /&gt;
  # Then 2 groups will be found:&lt;br /&gt;
  #&lt;br /&gt;
  # GROUP INDEX=0&lt;br /&gt;
  # directory 0: find_groups/a&lt;br /&gt;
  # directory 1: find_groups/b&lt;br /&gt;
  # directory 2: find_groups/c&lt;br /&gt;
  # files A,B&lt;br /&gt;
  #&lt;br /&gt;
  # GROUP INDEX=1&lt;br /&gt;
  &lt;br /&gt;
  # directory 0: find_groups/a&lt;br /&gt;
  # directory 1: find_groups/b&lt;br /&gt;
  # files Z&lt;br /&gt;
  #&lt;br /&gt;
  # You can then choose to delete the files of group index 0 from directory 1.&lt;br /&gt;
  # &lt;br /&gt;
  # EXIF data and file sizes are cached in cache.json, so even terabytes of data&lt;br /&gt;
  # can be compared quickly and repeatedly.&lt;br /&gt;
  &lt;br /&gt;
  import signal&lt;br /&gt;
  import sys&lt;br /&gt;
  import exifread&lt;br /&gt;
  from datetime import datetime&lt;br /&gt;
  import json&lt;br /&gt;
  from concurrent.futures import ThreadPoolExecutor&lt;br /&gt;
  # from typing import Self&lt;br /&gt;
  from tqdm import tqdm&lt;br /&gt;
  import os&lt;br /&gt;
  import traceback&lt;br /&gt;
  import json&lt;br /&gt;
  from pathlib import Path&lt;br /&gt;
  &lt;br /&gt;
  def files(path):&lt;br /&gt;
      matches = []&lt;br /&gt;
      for root, dirnames, filenames in os.walk(path):&lt;br /&gt;
          for p in filenames:&lt;br /&gt;
              x = os.path.join(root, p)&lt;br /&gt;
              if os.path.isfile(x):&lt;br /&gt;
                  matches.append(x)&lt;br /&gt;
      return matches&lt;br /&gt;
  &lt;br /&gt;
  &lt;br /&gt;
  def get_exif_date_model(image_path):&lt;br /&gt;
      try:&lt;br /&gt;
          with open(image_path, &amp;#039;rb&amp;#039;) as image_file:&lt;br /&gt;
              exif_tags = exifread.process_file(image_file, details=False)&lt;br /&gt;
              if &amp;#039;EXIF DateTimeOriginal&amp;#039; in exif_tags:&lt;br /&gt;
                  exif_date = exif_tags[&amp;#039;EXIF DateTimeOriginal&amp;#039;]&lt;br /&gt;
                  exif_date = datetime.strptime(str(exif_date), &amp;#039;%Y:%m:%d %H:%M:%S&amp;#039;)&lt;br /&gt;
                  model = exif_tags[&amp;#039;Image Model&amp;#039;]&lt;br /&gt;
                  print([exif_date, model])&lt;br /&gt;
                  return [image_path, f&amp;quot;{exif_date}:f{model}&amp;quot;]&lt;br /&gt;
      except Exception as e:&lt;br /&gt;
          print(f&amp;quot;Error reading EXIF data: {e}&amp;quot;)&lt;br /&gt;
          traceback.print_exc()&lt;br /&gt;
      return [image_path, &amp;quot;NO-EXIF&amp;quot;]&lt;br /&gt;
  &lt;br /&gt;
  def get_size(path):&lt;br /&gt;
      return [path, os.path.getsize(path)]&lt;br /&gt;
  &lt;br /&gt;
  def fill_cache(cache, paths, f, prefix):&lt;br /&gt;
      with tqdm(total=len(paths)) as progress:&lt;br /&gt;
          with ThreadPoolExecutor() as executor:&lt;br /&gt;
              for x in executor.map(f, paths):&lt;br /&gt;
                  if x:&lt;br /&gt;
                      file, r = x&lt;br /&gt;
                      progress.update()&lt;br /&gt;
                      cache[f&amp;quot;{prefix}{file}&amp;quot;] = r&lt;br /&gt;
                  else:&lt;br /&gt;
                      pass # no jpeg&lt;br /&gt;
              executor.shutdown()&lt;br /&gt;
  &lt;br /&gt;
  &lt;br /&gt;
  class UJsonStorage:&lt;br /&gt;
  &lt;br /&gt;
      def __init__(self, file_path):&lt;br /&gt;
          self.file_path = file_path&lt;br /&gt;
          self.data = {}&lt;br /&gt;
          try:&lt;br /&gt;
              with open(file_path, &amp;#039;r&amp;#039;) as file:&lt;br /&gt;
                  self.data = json.load(file)&lt;br /&gt;
          except FileNotFoundError:&lt;br /&gt;
              pass&lt;br /&gt;
  &lt;br /&gt;
      def __enter__(self):&lt;br /&gt;
          # even ctrl-c is enough to have the file written incompletely.&lt;br /&gt;
          self.set_sigterm_handler()&lt;br /&gt;
          return self&lt;br /&gt;
  &lt;br /&gt;
      def __exit__(self, exc_type, exc_value, traceback):&lt;br /&gt;
          with open(self.file_path, &amp;#039;w&amp;#039;) as file:&lt;br /&gt;
              json.dump(self.data, file, indent=4)&lt;br /&gt;
  &lt;br /&gt;
      def get_or(self, k, f):&lt;br /&gt;
          if not k in self.data:&lt;br /&gt;
              self.data[k] = f()&lt;br /&gt;
          return self.data[k]&lt;br /&gt;
  &lt;br /&gt;
      def __getitem__(self, item):&lt;br /&gt;
          return self.data[item]&lt;br /&gt;
  &lt;br /&gt;
      def __setitem__(self, key, value):&lt;br /&gt;
          self.data[key] = value&lt;br /&gt;
  &lt;br /&gt;
      def __delitem__(self, key):&lt;br /&gt;
          del self.data[key]&lt;br /&gt;
  &lt;br /&gt;
      def set_sigterm_handler(self):&lt;br /&gt;
          &amp;#039;&amp;#039;&amp;#039;Assigns sigterm_handler for graceful shutdown during dump()&amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
          def sigterm_handler(*args, **kwargs):&lt;br /&gt;
              if self.dthread is not None:&lt;br /&gt;
                  self.dthread.join()&lt;br /&gt;
              sys.exit(0)&lt;br /&gt;
          signal.signal(signal.SIGTERM, sigterm_handler)&lt;br /&gt;
  &lt;br /&gt;
  &lt;br /&gt;
  def main(directories):&lt;br /&gt;
  &lt;br /&gt;
      with UJsonStorage(&amp;quot;cache.json&amp;quot;) as cache:&lt;br /&gt;
          files_ = [y  for x in directories for y in files(x)]&lt;br /&gt;
          print(&amp;quot;files_&amp;quot;)&lt;br /&gt;
          print(files_)&lt;br /&gt;
          print(&amp;quot;e&amp;quot;)&lt;br /&gt;
  &lt;br /&gt;
          exif_missing = []&lt;br /&gt;
          size_missing = []&lt;br /&gt;
  &lt;br /&gt;
          def exif_key(file):&lt;br /&gt;
              return f&amp;quot;exif:{file}&amp;quot;&lt;br /&gt;
          def size_key(file):&lt;br /&gt;
              return f&amp;quot;size:{file}&amp;quot;&lt;br /&gt;
  &lt;br /&gt;
          for x in files_:&lt;br /&gt;
              ek = exif_key(x)&lt;br /&gt;
              sk = size_key(x)&lt;br /&gt;
              if not ek in cache.data:&lt;br /&gt;
                  exif_missing.append(x)&lt;br /&gt;
              if not sk in cache.data:&lt;br /&gt;
                  size_missing.append(x)&lt;br /&gt;
  &lt;br /&gt;
          print(&amp;#039;lese missing exif&amp;#039;)&lt;br /&gt;
          fill_cache(cache, exif_missing, get_exif_date_model, &amp;quot;exif:&amp;quot;)&lt;br /&gt;
          print(&amp;#039;lese missing size&amp;#039;)&lt;br /&gt;
          fill_cache(cache, size_missing, get_size, &amp;quot;size:&amp;quot;)&lt;br /&gt;
  &lt;br /&gt;
          def key(path):&lt;br /&gt;
              size = cache[f&amp;quot;size:{path}&amp;quot;]&lt;br /&gt;
              exif = cache[f&amp;quot;exif:{path}&amp;quot;]&lt;br /&gt;
              return f&amp;quot;{os.path.basename(path)}:{size}:{exif}&amp;quot;&lt;br /&gt;
  &lt;br /&gt;
          bydirs = {}&lt;br /&gt;
          for f in files_:&lt;br /&gt;
              p = Path(f)&lt;br /&gt;
              k = key(f)&lt;br /&gt;
              if not k in bydirs:&lt;br /&gt;
                  bydirs[k] = {&amp;quot;basename&amp;quot;: p.name, &amp;quot;directories&amp;quot;: []}&lt;br /&gt;
              bydirs[k][&amp;quot;directories&amp;quot;].append(str(p.parent))&lt;br /&gt;
  &lt;br /&gt;
          print(bydirs)&lt;br /&gt;
  &lt;br /&gt;
          groups = {}&lt;br /&gt;
  &lt;br /&gt;
          for k, v in bydirs.items():&lt;br /&gt;
              v[&amp;quot;directories&amp;quot;].sort()&lt;br /&gt;
              d_str = &amp;quot;::&amp;quot;.join(v[&amp;quot;directories&amp;quot;])&lt;br /&gt;
              if not d_str in groups:&lt;br /&gt;
                  groups[d_str] = []&lt;br /&gt;
              groups[d_str].append(v[&amp;quot;basename&amp;quot;])&lt;br /&gt;
  &lt;br /&gt;
          group_list = []&lt;br /&gt;
  &lt;br /&gt;
          for k, v in groups.items():&lt;br /&gt;
              group_list.append({&lt;br /&gt;
                  &amp;quot;directory_list&amp;quot;: k.split(&amp;quot;::&amp;quot;),&lt;br /&gt;
                  &amp;quot;files&amp;quot;: v&lt;br /&gt;
                  })&lt;br /&gt;
  &lt;br /&gt;
          group_list = [x for x in group_list if len(x[&amp;quot;directory_list&amp;quot;]) &amp;gt; 1]&lt;br /&gt;
  &lt;br /&gt;
          group_list.sort(key = lambda x: len(x[&amp;quot;files&amp;quot;]))&lt;br /&gt;
  &lt;br /&gt;
          def print_dirs(g):&lt;br /&gt;
              for i, d in enumerate(g[&amp;quot;directory_list&amp;quot;]):&lt;br /&gt;
                  print(&amp;quot;%s: %s &amp;quot; % (i, d))&lt;br /&gt;
  &lt;br /&gt;
          for i, g in enumerate(group_list):&lt;br /&gt;
              print(&amp;quot;&amp;quot;)&lt;br /&gt;
              print(&amp;quot;&amp;quot;)&lt;br /&gt;
              print(&amp;quot;GROUP [INDEX=%s] === count: %s&amp;quot; % (i, len(g[&amp;quot;files&amp;quot;])))&lt;br /&gt;
              print(&amp;quot;directories:&amp;quot;)&lt;br /&gt;
              print_dirs(g)&lt;br /&gt;
              print(&amp;quot;files:&amp;quot;)&lt;br /&gt;
              print(g[&amp;quot;files&amp;quot;])&lt;br /&gt;
  &lt;br /&gt;
          while True:&lt;br /&gt;
              a = input(&amp;quot;delete group [index|Q=quit]: &amp;quot;)&lt;br /&gt;
              if (a == &amp;quot;Q&amp;quot;):&lt;br /&gt;
                  return&lt;br /&gt;
              a = int(a)&lt;br /&gt;
              group = group_list[int(a)]&lt;br /&gt;
              print_dirs(group)&lt;br /&gt;
              b = input(&amp;quot;delete files from directory [index]: &amp;quot;)&lt;br /&gt;
              idx = int(a)&lt;br /&gt;
              d = group[&amp;quot;directory_list&amp;quot;][int(b)]&lt;br /&gt;
              for f in group[&amp;quot;files&amp;quot;]:&lt;br /&gt;
                  x = os.path.join(d, f)&lt;br /&gt;
                  print(&amp;quot;del %s&amp;quot; % x)&lt;br /&gt;
                  os.unlink(x)&lt;br /&gt;
  &lt;br /&gt;
  if __name__ == &amp;quot;__main__&amp;quot;:&lt;br /&gt;
      directories = sys.argv[1:]&lt;br /&gt;
      print(directories)&lt;br /&gt;
      main(directories)&lt;/div&gt;</summary>
		<author><name>Marc</name></author>
	</entry>
</feed>