#!/usr/bin/env python3
"""Archive iPhone photos from a local unencrypted backup and build an HTML gallery.

The script copies every ``CameraRollDomain`` file listed in the backup's
``Manifest.db`` into an output folder (keeping the relative paths recorded in
the manifest) and can then generate ``html_view/index.html`` inside that
folder, grouping the photos by the day they were taken.
"""

# NOTE: the change that introduced this module also added ``photos-archive/``
# to .gitignore so that extracted photos are never committed.

import argparse
import html
import os
import re
import shutil
import sqlite3
import struct
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from urllib.parse import quote

# Marker that precedes the TIFF structure inside a JPEG APP1 segment.
_EXIF_HEADER = b'Exif\x00\x00'

# EXIF-style timestamp, e.g. "2024:08:15 14:30:45".
_EXIF_DATETIME_RE = re.compile(r'20\d{2}:\d{2}:\d{2}\s+\d{2}:\d{2}:\d{2}')
# Timestamp as found in PNG text chunks ("-" or ":" date separators, " " or "T").
_TEXT_DATETIME_RE = re.compile(r'20\d{2}[:-]\d{2}[:-]\d{2}[\sT]\d{2}:\d{2}:\d{2}')

# PNG text-chunk keywords (lower-cased) that are treated as carrying a date.
_ITXT_DATE_KEYWORDS = frozenset(
    {'date', 'datetime', 'creation time', 'date:create', 'exif:datetime'}
)
_TEXT_DATE_KEYWORDS = frozenset({'date', 'creation time', 'timestamp'})

# Tags whose value is accepted as a capture date.
_DATE_TAGS = {
    0x9003: 'DateTimeOriginal',   # EXIF DateTimeOriginal
    0x0132: 'DateTime',           # Image DateTime
    0x9004: 'DateTimeDigitized',  # EXIF DateTimeDigitized
    # NOTE(review): 306 decimal == 0x0132, so this entry looks like a
    # hex/decimal mix-up. Kept so no tag that used to match is dropped.
    0x0306: 'DateTime',
}
# IFD0 tag whose value is the offset of the Exif sub-IFD.
_EXIF_IFD_POINTER_TAG = 0x8769

# File suffixes (upper-cased) that count as original camera-roll media.
_ORIGINAL_EXTENSIONS = frozenset({'.HEIC', '.JPG', '.PNG', '.MOV', '.MP4', '.JPEG'})
# Suffixes (lower-cased) for which an embedded date is looked up.
_EXIF_CAPABLE_SUFFIXES = frozenset({'.jpg', '.jpeg', '.heic', '.png'})
# Display suffixes (upper-cased) rendered with <video> instead of <img>.
_VIDEO_EXTENSIONS = frozenset({'.MOV', '.MP4'})

_GALLERY_CSS = """
        * { box-sizing: border-box; }
        body {
            margin: 0;
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
            background: #111;
            color: #eee;
        }
        a { color: inherit; }
        .container { max-width: 1400px; margin: 0 auto; padding: 20px; }
        .header { text-align: center; margin-bottom: 30px; }
        .header h1 { margin: 0 0 8px; }
        .header p { margin: 0 0 8px; color: #aaa; }
        .stats { color: #888; font-size: 0.9em; }
        .date-section { margin-bottom: 32px; }
        .date-header {
            display: flex;
            justify-content: space-between;
            align-items: baseline;
            border-bottom: 1px solid #333;
            padding-bottom: 6px;
            margin-bottom: 12px;
        }
        .date-title { font-size: 1.2em; font-weight: 600; }
        .date-count { color: #888; font-size: 0.9em; }
        .gallery {
            display: grid;
            grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
            gap: 10px;
        }
        .gallery-item { background: #1c1c1c; border-radius: 6px; overflow: hidden; }
        .gallery-item img,
        .gallery-item video {
            display: block;
            width: 100%;
            height: 180px;
            object-fit: cover;
        }
        .caption { padding: 6px 8px; font-size: 0.75em; color: #aaa; }
"""


def read_exif_date(file_path):
    """Return the embedded capture date of an image file, or ``None``.

    The container format is detected from the first 12 bytes: JPEG, ISO-BMFF
    (``ftyp`` box, used by HEIC) or PNG. The result is the raw date string as
    stored in the file, normally ``"YYYY:MM:DD HH:MM:SS"``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The date string, or ``None`` when the format is not recognised, no
        date is found, or the file cannot be read or parsed.
    """
    try:
        with open(file_path, 'rb') as f:
            header = f.read(12)
            if header.startswith((b'\xff\xd8', b'\xff\xe1')):
                # Walk the segments properly first; fall back to a raw scan of
                # the first 64 KB for files with a damaged segment chain.
                return _read_jpeg_exif_date(f) or _scan_jpeg_exif_date(f)
            if header[4:8] == b'ftyp':
                return _read_heic_exif_date(f)
            if header.startswith(b'\x89PNG'):
                return _read_png_exif_date(f)
    except Exception:
        # Deliberately best-effort: a corrupt or unreadable file must never
        # abort the archive run; callers fall back to the file's mtime.
        return None
    return None


def _read_jpeg_exif_date(f):
    """Walk the JPEG segment chain and parse the first Exif APP1 segment.

    Returns the date string from that segment, or ``None`` when the chain
    ends, is truncated, or the Exif segment holds no usable date.
    """
    f.seek(0)
    while True:
        marker = f.read(2)
        if len(marker) < 2 or marker[0] != 0xff:
            return None
        if marker[1] in (0xd8, 0xd9):  # SOI / EOI carry no length field
            continue

        length_bytes = f.read(2)
        if len(length_bytes) < 2:
            return None
        length = struct.unpack('>H', length_bytes)[0]
        if length < 2:  # the length field counts its own two bytes
            return None

        if marker == b'\xff\xe1':  # APP1; may hold Exif or something else
            payload = f.read(length - 2)
            if payload.startswith(_EXIF_HEADER):
                return _parse_exif_data(payload[len(_EXIF_HEADER):])
        else:
            f.seek(length - 2, 1)


def _scan_jpeg_exif_date(f):
    """Search the first 64 KB of a JPEG for an Exif header and parse it."""
    f.seek(0)
    data = f.read(65536)
    exif_pos = data.find(_EXIF_HEADER)
    if exif_pos > 0:
        return _parse_exif_data(data[exif_pos + len(_EXIF_HEADER):])
    return None


def _read_heic_exif_date(f):
    """Look for EXIF data in the first 2 MB of an ISO-BMFF (HEIC) file.

    This is a heuristic rather than a box parser: every occurrence of an
    ``Exif`` marker is tried as the start of a TIFF structure, and if none
    parses, the buffer is scanned for a bare timestamp string.
    """
    f.seek(0)
    data = f.read(2 * 1024 * 1024)

    for pattern in (b'Exif\x00\x00', b'Exif\x00\x01', b'EXIF\x00\x00'):
        exif_pos = data.find(pattern)
        # Only one occurrence, if any, is followed by a TIFF header, so every
        # occurrence is tried rather than just the first.
        while exif_pos >= 0:
            result = _parse_exif_data(data[exif_pos + len(pattern):])
            if result:
                return result
            exif_pos = data.find(pattern, exif_pos + 1)

    return _scan_for_datetime_strings(data)


def _scan_for_datetime_strings(data):
    """Return the first ``20YY:MM:DD HH:MM:SS`` string found in *data*.

    Non-ASCII bytes are dropped before matching. Returns ``None`` when no
    such string is present.
    """
    text = data.decode('ascii', errors='ignore')
    match = _EXIF_DATETIME_RE.search(text)
    return match.group() if match else None


def _read_png_exif_date(f):
    """Read a capture date from a PNG's ``eXIf``, ``iTXt`` or ``tEXt`` chunks.

    Chunks are visited in file order. An ``eXIf`` chunk ends the search and
    its parse result is returned even if that is ``None``; a text chunk is
    used only if its keyword names a date and its text contains a timestamp.
    """
    f.seek(8)  # skip the PNG signature
    while True:
        chunk_head = f.read(8)  # 4-byte big-endian length + 4-byte type
        if len(chunk_head) != 8:
            return None
        length, chunk_type = struct.unpack('>I4s', chunk_head)

        if chunk_type == b'eXIf':
            return _parse_exif_data(f.read(length))
        if chunk_type in (b'iTXt', b'tEXt'):
            date_str = _png_text_chunk_date(chunk_type, f.read(length))
            if date_str:
                return date_str
        else:
            f.seek(length, 1)

        f.seek(4, 1)  # skip the CRC


def _png_text_chunk_date(chunk_type, chunk_data):
    """Extract an EXIF-formatted date from one PNG text chunk, or ``None``."""
    if chunk_type == b'iTXt':
        # keyword\0 flag method language\0 translated_keyword\0 text
        parts = chunk_data.split(b'\0', 4)
        if len(parts) < 5:
            return None
        keyword = parts[0].decode('latin-1')
        text = parts[4].decode('utf-8', errors='ignore')
        date_keywords = _ITXT_DATE_KEYWORDS
    else:
        # tEXt: keyword\0text
        null_pos = chunk_data.find(b'\0')
        if null_pos <= 0:
            return None
        keyword = chunk_data[:null_pos].decode('latin-1')
        text = chunk_data[null_pos + 1:].decode('latin-1')
        date_keywords = _TEXT_DATE_KEYWORDS

    if keyword.lower() not in date_keywords:
        return None
    match = _TEXT_DATETIME_RE.search(text)
    if not match:
        return None
    # Normalise "2024-08-15T14:30:45" to the EXIF form "2024:08:15 14:30:45".
    return match.group().replace('-', ':').replace('T', ' ')


def _parse_exif_data(exif_data):
    """Extract a date string from a TIFF/EXIF structure.

    Args:
        exif_data: Bytes starting at the TIFF header (``II`` or ``MM``).

    Returns:
        The first date found, trying the Exif sub-IFD, then IFD0, then IFD1;
        ``None`` if the header is invalid or no date tag is usable.
    """
    if len(exif_data) < 8:
        return None

    byte_order = exif_data[:2]
    if byte_order == b'II':
        endian = '<'  # little endian
    elif byte_order == b'MM':
        endian = '>'  # big endian
    else:
        return None

    try:
        ifd0_offset = struct.unpack_from(endian + 'I', exif_data, 4)[0]
        if ifd0_offset >= len(exif_data):
            return None

        # The Exif sub-IFD goes first: it holds DateTimeOriginal (the actual
        # capture time), whereas IFD0's DateTime is the last-modified time.
        exif_ifd_offset = _find_exif_ifd(exif_data, ifd0_offset, endian)
        if exif_ifd_offset and exif_ifd_offset < len(exif_data):
            date_value = _read_ifd_dates(exif_data, exif_ifd_offset, endian, _DATE_TAGS)
            if date_value:
                return date_value

        date_value = _read_ifd_dates(exif_data, ifd0_offset, endian, _DATE_TAGS)
        if date_value:
            return date_value

        # IFD1 describes the thumbnail; last resort.
        ifd1_offset = _get_next_ifd(exif_data, ifd0_offset, endian)
        if ifd1_offset and ifd1_offset < len(exif_data):
            return _read_ifd_dates(exif_data, ifd1_offset, endian, _DATE_TAGS)
    except struct.error:
        return None
    return None


def _iter_ifd_entries(exif_data, ifd_offset, endian):
    """Yield ``(tag, type, count, value_offset)`` for each entry of an IFD.

    Iteration stops silently at the first entry that would run past the end
    of *exif_data*.
    """
    if ifd_offset + 2 >= len(exif_data):
        return
    entry_count = struct.unpack_from(endian + 'H', exif_data, ifd_offset)[0]
    entry_struct = struct.Struct(endian + 'HHII')
    for index in range(entry_count):
        entry_offset = ifd_offset + 2 + index * 12
        if entry_offset + 12 > len(exif_data):
            return
        yield entry_struct.unpack_from(exif_data, entry_offset)


def _read_ifd_dates(exif_data, ifd_offset, endian, date_tags):
    """Return the first plausible date string stored under *date_tags*.

    A value is accepted when it has at least 19 characters and contains a
    colon. Trailing NULs and spaces are removed so that padded values still
    parse as ``YYYY:MM:DD HH:MM:SS``.
    """
    try:
        for tag, tag_type, count, value_offset in _iter_ifd_entries(
                exif_data, ifd_offset, endian):
            if tag not in date_tags:
                continue

            if tag_type == 2:  # ASCII
                if count <= 4:
                    # Short values live in the offset field itself.
                    packed = struct.pack(endian + 'I', value_offset)
                    value_data = packed[:max(count - 1, 0)]
                elif value_offset + count <= len(exif_data):
                    value_data = exif_data[value_offset:value_offset + count - 1]
                else:
                    continue
                try:
                    date_str = value_data.decode('ascii')
                except UnicodeDecodeError:
                    continue
            elif tag_type in (1, 3, 4, 5):  # BYTE, SHORT, LONG, RATIONAL
                # Some writers mislabel the type; try the bytes as text.
                if count <= 4:
                    raw_data = struct.pack(endian + 'I', value_offset)
                elif value_offset + count * 4 <= len(exif_data):
                    raw_data = exif_data[value_offset:value_offset + min(count * 4, 20)]
                else:
                    continue
                date_str = raw_data.decode('ascii', errors='ignore')
            else:
                continue

            date_str = date_str.rstrip('\x00 ')
            if len(date_str) >= 19 and ':' in date_str:
                return date_str
    except struct.error:
        return None
    return None


def _get_next_ifd(exif_data, ifd_offset, endian):
    """Return the offset of the IFD that follows the one at *ifd_offset*.

    Returns ``None`` when the link is zero or lies outside *exif_data*.
    """
    try:
        if ifd_offset + 2 >= len(exif_data):
            return None
        entry_count = struct.unpack_from(endian + 'H', exif_data, ifd_offset)[0]
        link_pos = ifd_offset + 2 + entry_count * 12
        if link_pos + 4 > len(exif_data):
            return None
        next_ifd_offset = struct.unpack_from(endian + 'I', exif_data, link_pos)[0]
    except struct.error:
        return None
    return next_ifd_offset or None


def _find_exif_ifd(exif_data, ifd_offset, endian):
    """Return the Exif sub-IFD offset stored in tag 0x8769, or ``None``."""
    try:
        for tag, _tag_type, _count, value_offset in _iter_ifd_entries(
                exif_data, ifd_offset, endian):
            if tag == _EXIF_IFD_POINTER_TAG:
                return value_offset
    except struct.error:
        return None
    return None


def copy_camera_roll(backup_path: Path, output_path: Path):
    """Copy every CameraRollDomain file of a backup into *output_path*.

    Args:
        backup_path: Backup folder that contains ``Manifest.db``.
        output_path: Destination root; each file is written to
            ``output_path / relativePath``. Existing files are not
            overwritten.

    Raises:
        FileNotFoundError: If ``Manifest.db`` is missing from *backup_path*.
    """
    manifest_db = backup_path / "Manifest.db"
    if not manifest_db.exists():
        raise FileNotFoundError(f"Manifest.db not found in {backup_path}")

    conn = sqlite3.connect(manifest_db)
    try:
        rows = conn.execute("""
            SELECT fileID, relativePath
            FROM Files
            WHERE domain = 'CameraRollDomain'
        """).fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    print(f"Found {len(rows)} CameraRollDomain files")

    for file_id, relative_path in rows:
        # fileID is a 40-char hex string; the backup stores the blob as
        # <first two hex chars>/<fileID>.
        src = backup_path / file_id[:2] / file_id
        if not src.exists():
            print(f"⚠️  Missing file: {src}")
            continue

        dest = output_path / relative_path
        dest.parent.mkdir(parents=True, exist_ok=True)

        if not dest.exists():
            shutil.copy2(src, dest)
            print(f"✅ Copied {relative_path}")
        else:
            print(f"⏩ Skipped (already exists): {relative_path}")

    print("🎉 Backup extraction completed.")


def find_display_file(original_file, metadata_dcim, thumbnails_dcim):
    """Pick the file that should be shown in the gallery for an original.

    Args:
        original_file: Path of the original media file.
        metadata_dcim: ``PhotoData/Metadata/DCIM`` folder (may not exist).
        thumbnails_dcim: ``PhotoData/Thumbnails/V2/DCIM`` folder (may not exist).

    Returns:
        A ``(path, kind)`` tuple where *kind* is ``"metadata"``,
        ``"thumbnail"`` or ``"original"``, tried in that order.
    """
    base_name = original_file.stem  # e.g. "IMG_1105"

    # Preferred: a JPG with the same stem in any metadata sub-folder.
    if metadata_dcim.exists():
        for folder in metadata_dcim.iterdir():
            if folder.is_dir():
                metadata_jpg = folder / f"{base_name}.JPG"
                if metadata_jpg.exists():
                    return metadata_jpg, "metadata"

    # Fallback: each image has a thumbnail directory named after the full
    # original filename, holding a JPG (usually numbered, e.g. 5003.JPG).
    if thumbnails_dcim.exists():
        for dcim_folder in thumbnails_dcim.iterdir():
            if not dcim_folder.is_dir():
                continue
            image_dir = dcim_folder / original_file.name
            if image_dir.is_dir():
                thumbnail = next(iter(image_dir.glob("*.JPG")), None)
                if thumbnail is not None:
                    return thumbnail, "thumbnail"

    return original_file, "original"


def get_all_original_files(original_dcim):
    """Return all original image/video files found in the DCIM sub-folders.

    Only direct children of each sub-folder are considered. Suffixes are
    compared case-insensitively and the result is sorted by folder, then by
    file name.
    """
    original_files = []
    for folder in sorted(original_dcim.iterdir()):
        if not folder.is_dir():
            continue
        original_files.extend(
            entry for entry in sorted(folder.iterdir())
            if entry.suffix.upper() in _ORIGINAL_EXTENSIONS and entry.is_file()
        )
    return original_files


def get_file_info(file_path):
    """Collect size and capture-date information for a file.

    Args:
        file_path: ``pathlib.Path`` of an existing file.

    Returns:
        A dict with ``size`` (bytes), ``size_mb`` (rounded to 2 decimals),
        ``date_taken`` (``"YYYY-MM-DD HH:MM:SS"``) and ``date_taken_obj``
        (``datetime``). The date comes from embedded metadata when it can be
        read and parsed, otherwise from the file's modification time.
    """
    stat = file_path.stat()

    date_taken_obj = None
    if file_path.suffix.lower() in _EXIF_CAPABLE_SUFFIXES:
        exif_date = read_exif_date(file_path)
        if exif_date:
            try:
                date_taken_obj = datetime.strptime(exif_date, '%Y:%m:%d %H:%M:%S')
            except ValueError:
                date_taken_obj = None  # malformed date; fall back to mtime

    if date_taken_obj is None:
        date_taken_obj = datetime.fromtimestamp(stat.st_mtime)

    return {
        'size': stat.st_size,
        'date_taken': date_taken_obj.strftime('%Y-%m-%d %H:%M:%S'),
        'date_taken_obj': date_taken_obj,
        'size_mb': round(stat.st_size / (1024 * 1024), 2),
    }


def generate_gallery(photos_root: Path):
    """Generate ``html_view/index.html`` for an extracted camera roll.

    Args:
        photos_root: Folder that contains the extracted ``Media`` tree.

    Prints progress to stdout. Returns without writing anything when the
    ``Media/DCIM`` folder is missing or holds no supported files.
    """
    html_view = photos_root / "html_view"

    metadata_dcim = photos_root / "Media" / "PhotoData" / "Metadata" / "DCIM"
    thumbnails_dcim = photos_root / "Media" / "PhotoData" / "Thumbnails" / "V2" / "DCIM"
    original_dcim = photos_root / "Media" / "DCIM"

    if not original_dcim.exists():
        print(f"❌ Original DCIM folder not found: {original_dcim}")
        return

    print("📁 Looking for display files in:")
    print(f"   Metadata: {'Found' if metadata_dcim.exists() else 'Not found'}")
    print(f"   Thumbnails: {'Found' if thumbnails_dcim.exists() else 'Not found'}")

    original_files = get_all_original_files(original_dcim)
    print(f"Found {len(original_files)} original files")

    if not original_files:
        print("❌ No images found to generate gallery")
        return

    images = []
    display_counts = Counter()

    for original_file in original_files:
        display_file, display_type = find_display_file(
            original_file, metadata_dcim, thumbnails_dcim
        )
        display_counts[display_type] += 1

        original_info = get_file_info(original_file)
        if display_file != original_file:
            display_info = get_file_info(display_file)
        else:
            display_info = original_info

        images.append({
            'name': original_file.stem,
            'display_path': str(display_file.relative_to(photos_root)),
            'original_path': str(original_file.relative_to(photos_root)),
            'folder': original_file.parent.name,
            'display_info': display_info,
            'original_info': original_info,
            'original_ext': original_file.suffix.upper(),
            'display_type': display_type,
            'display_ext': display_file.suffix.upper(),
        })

    print(f"📊 Total original files: {len(original_files)}")
    print(f"📊 Using metadata for display: {display_counts['metadata']}")
    print(f"📊 Using thumbnails for display: {display_counts['thumbnail']}")
    print(f"📊 Using original for display: {display_counts['original']}")
    print(f"📊 Images to display: {len(images)}")

    # Newest first; ties are broken by name (also descending).
    images.sort(
        key=lambda img: (img['original_info']['date_taken_obj'], img['name']),
        reverse=True,
    )

    grouped_images = defaultdict(list)
    for img in images:
        date_key = img['original_info']['date_taken_obj'].strftime('%Y-%m-%d')
        grouped_images[date_key].append(img)

    html_content = generate_html_content(images, grouped_images)

    html_view.mkdir(parents=True, exist_ok=True)
    output_file = html_view / "index.html"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"✅ Gallery generated: {output_file}")
    print(f"📊 {len(images)} images included")
    print(f"🌐 Open {output_file} in your browser to view the gallery")


def generate_html_content(images, grouped_images):
    """Return the complete HTML document for the gallery.

    Args:
        images: List of image dicts as built by ``generate_gallery``.
        grouped_images: Mapping of ``"YYYY-MM-DD"`` to the images of that day.
    """
    folder_count = len({img['folder'] for img in images})
    sections_html = generate_gallery_sections(grouped_images)
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>iPhone Gallery Archive</title>
    <style>{_GALLERY_CSS}    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>📸 iPhone Camera Roll Gallery</h1>
            <p>Extracted from iPhone backup</p>
            <div class="stats">
                {len(images)} photos • {folder_count} folders
            </div>
        </div>
{sections_html}
    </div>
</body>
</html>
"""


def _gallery_url(relative_path):
    """Turn a path relative to the photos root into an escaped gallery URL.

    ``index.html`` lives in ``<root>/html_view/``, so every media path is
    prefixed with ``../``. The path is percent-encoded and HTML-escaped so it
    is safe inside an attribute value.
    """
    posix_path = relative_path.replace(os.sep, '/')
    return html.escape('../' + quote(posix_path), quote=True)


def _render_gallery_item(img):
    """Return the HTML for one gallery tile."""
    info = img['original_info']
    name = html.escape(img['name'], quote=True)
    file_label = html.escape(f"{img['name']}{img['original_ext']}", quote=True)
    details = html.escape(f"{info['date_taken']} • {info['size_mb']} MB", quote=True)
    display_url = _gallery_url(img['display_path'])
    original_url = _gallery_url(img['original_path'])

    if img['display_ext'] in _VIDEO_EXTENSIONS:
        media = f'<video src="{display_url}" controls preload="metadata"></video>'
    else:
        media = (
            f'<a href="{original_url}" target="_blank">'
            f'<img src="{display_url}" alt="{name}" loading="lazy"></a>'
        )

    return (
        '                <div class="gallery-item">\n'
        f'                    {media}\n'
        '                    <div class="caption">'
        f'<a href="{original_url}" target="_blank">{file_label}</a> {details}</div>\n'
        '                </div>'
    )


def generate_gallery_sections(grouped_images):
    """Return the HTML for all per-day sections, newest day first.

    Args:
        grouped_images: Mapping of ``"YYYY-MM-DD"`` to a list of image dicts.
    """
    sections = []

    for date_key in sorted(grouped_images, reverse=True):
        day_images = grouped_images[date_key]
        date_display = datetime.strptime(date_key, '%Y-%m-%d').strftime('%d.%m.%Y')
        image_count = len(day_images)
        items_html = "\n".join(_render_gallery_item(img) for img in day_images)

        sections.append(f"""
        <div class="date-section">
            <div class="date-header">
                <span class="date-title">{date_display}</span>
                <span class="date-count">{image_count} {'photo' if image_count == 1 else 'photos'}</span>
            </div>
            <div class="gallery">
{items_html}
            </div>
        </div>
""")

    return "".join(sections)


def main():
    """Parse the command line, extract the camera roll, optionally build the gallery."""
    parser = argparse.ArgumentParser(
        description="Extract Camera Roll from iPhone backup and optionally generate HTML gallery"
    )
    parser.add_argument("--backup-path", required=True, type=Path,
                        help="Path to iPhone backup folder (with Manifest.db)")
    parser.add_argument("--output-path", required=True, type=Path,
                        help="Path where Camera Roll should be restored")
    parser.add_argument("--generate-gallery", action="store_true",
                        help="Generate HTML gallery after extraction")

    args = parser.parse_args()

    copy_camera_roll(args.backup_path, args.output_path)

    if args.generate_gallery:
        print("\n🖼️  Generating HTML gallery...")
        generate_gallery(args.output_path)


if __name__ == "__main__":
    main()