#!/usr/bin/env python3

# NOTE: most of this code written with qwen3-coder:30b

"""Generate ``base-files.json``: the ``base-files`` .deb filename per release/arch.

The script collects the Debian release names listed in the archive's
``dists/README`` (minus ``rc-buggy``), adds a hard-coded list of Ubuntu
releases, and for each release:

1. reads the ``Architecture`` field of the ``base-files`` source package from
   ``Sources.gz``;
2. if that field contains ``any``, takes the architecture list from the
   release's ``InRelease`` file instead;
3. looks up the binary package in each architecture's ``Packages.gz`` and
   synthesizes ``<package>_<version>_<architecture>.deb``.

The result, a ``{release: {architecture: filename-or-null}}`` mapping, is
printed and written to ``base-files.json``.  Every downloaded index is cached
in ``./debian_cache`` and reused on later runs.
"""

import gzip
import json
import os
import re
from pathlib import Path
from urllib.parse import urljoin  # noqa: F401 -- unused; kept from the original import list

PACKAGE_NAME = "base-files"
OUTPUT_FILE = "base-files.json"
HTTP_TIMEOUT = 30  # seconds, applied to every HTTP request

# Mirror roots, all written WITHOUT a trailing slash so that
# f"{root}/dists/..." never contains a double slash.
DEBIAN_MIRROR = "http://deb.debian.org/debian"
DEBIAN_FTP_MIRROR = "http://ftp.debian.org/debian"
DEBIAN_PORTS_MIRROR = "http://ftp.ports.debian.org/debian-ports"
UBUNTU_ARCHIVE_MIRROR = "http://archive.ubuntu.com/ubuntu"
UBUNTU_PORTS_MIRROR = "http://ports.ubuntu.com"

# Where each distro's InRelease / Sources.gz is fetched from.
_INRELEASE_MIRRORS = {"debian": DEBIAN_MIRROR, "ubuntu": UBUNTU_ARCHIVE_MIRROR}
_SOURCES_MIRRORS = {"debian": DEBIAN_MIRROR, "ubuntu": UBUNTU_PORTS_MIRROR}

# FIXME: these are fetchable from changelogs.ubuntu.com/meta-release
# (filter by 'Supported: 1').
# Don't do this yet b/c jammy goes EOS Apr 2027, we don't know if we'll be ready.
# Also resolute isn't in changelog as of 2025Dec03.
UBUNTU_RELEASES = [
    'ubuntu/jammy',
    'ubuntu/noble',
    'ubuntu/plucky',
    'ubuntu/questing',
    'ubuntu/resolute',
]

# Matches "oldstable, or bookworm   ..." and captures "bookworm".
_RELEASE_ALIAS_RE = re.compile(r'\S+, or (\S+)\s+')


def _unsupported_distro(distro):
    """Build the error raised for a distro other than 'debian' or 'ubuntu'."""
    return ValueError(f"Unsupported distro {distro!r}; expected 'debian' or 'ubuntu'")


def _lookup_mirror(table, distro):
    """Return the mirror root for *distro* from *table*, or raise ValueError."""
    try:
        return table[distro]
    except KeyError:
        raise _unsupported_distro(distro) from None


def _binary_mirror(distro, architecture):
    """Return the mirror root that serves Packages.gz for *distro*/*architecture*."""
    if distro == 'debian':
        # loong64 is only looked up in the debian-ports archive.
        return DEBIAN_PORTS_MIRROR if architecture == 'loong64' else DEBIAN_FTP_MIRROR
    if distro == 'ubuntu':
        # Prefix match on purpose: there is amd64 and amd64v3.
        if re.match(r"(i386|amd64)", architecture):
            return UBUNTU_ARCHIVE_MIRROR
        return UBUNTU_PORTS_MIRROR
    raise _unsupported_distro(distro)


def _prepare_cache_dir(cache_dir):
    """Create *cache_dir* (including missing parents) if it does not exist."""
    Path(cache_dir).mkdir(parents=True, exist_ok=True)


def _http_get(url):
    """GET *url* and return the response, raising on an HTTP error status."""
    # Imported here rather than at module level so that the parsing helpers
    # stay importable on machines without the third-party ``requests`` package.
    import requests

    response = requests.get(url, timeout=HTTP_TIMEOUT)
    response.raise_for_status()
    return response


def _write_cache(cache_path, payload):
    """Write *payload* (``str`` or ``bytes``) to *cache_path* atomically.

    The data goes to a ``.part`` file first and is renamed into place, so an
    interrupted run never leaves a truncated file that later runs would trust.
    """
    partial_path = f"{cache_path}.part"
    if isinstance(payload, bytes):
        with open(partial_path, 'wb') as f:
            f.write(payload)
    else:
        with open(partial_path, 'w', encoding='utf-8') as f:
            f.write(payload)
    os.replace(partial_path, cache_path)


def _read_text_cached(url, cache_path, label=None):
    """Return the text at *url*, using *cache_path* when it already exists.

    Progress messages are printed only when *label* is given.
    """
    if os.path.exists(cache_path):
        if label:
            print(f"Using cached {label}: {cache_path}")
        with open(cache_path, encoding='utf-8') as f:
            return f.read()

    if label:
        print(f"Downloading {label}...")
    text = _http_get(url).text
    _write_cache(cache_path, text)
    return text


def _read_gzip_cached(url, cache_path, label, detail):
    """Return the decompressed text of the gzip index at *url*, caching the raw file."""
    if os.path.exists(cache_path):
        print(f"Using cached {label}: {cache_path}")
    else:
        print(f"Downloading {label} for {detail}...")
        _write_cache(cache_path, _http_get(url).content)

    with gzip.open(cache_path, 'rt', encoding='utf-8') as f:
        return f.read()


def _parse_stanza(stanza):
    """Parse one control-file stanza into a ``{lowercased-field: value}`` dict.

    Only the first line of each field is kept.  Lines that start with
    whitespace are continuations of the previous field and are skipped, so a
    colon inside a multi-line value can never be mistaken for a new field.
    """
    fields = {}
    for line in stanza.split('\n'):
        if not line or line[0] in ' \t':
            continue
        key, separator, value = line.partition(':')
        if separator:
            fields[key.strip().lower()] = value.strip()
    return fields


def _find_package_stanza(index_content, package_name):
    """Return the parsed stanza whose ``Package`` equals *package_name*, else None.

    Stanzas are separated by blank lines; the name comparison is
    case-insensitive and the first matching stanza wins.
    """
    wanted = package_name.lower()
    for stanza in index_content.split('\n\n'):
        if not stanza.strip():
            continue
        fields = _parse_stanza(stanza)
        if fields.get('package', '').lower() == wanted:
            return fields
    return None


def get_debian_release_names(cache_dir="./debian_cache"):
    """
    Get Debian release names from the archive's dists/README file.

    Args:
        cache_dir: directory holding the cached ``README``; created if missing.

    Returns:
        A list of ``"debian/<name>"`` strings, one for every README line of
        the form ``"<alias>, or <name> ..."``, in file order.

    Raises:
        requests.HTTPError: if the README has to be downloaded and the server
            answers with an error status.
    """
    _prepare_cache_dir(cache_dir)

    readme_url = f"{DEBIAN_MIRROR}/dists/README"
    readme_path = os.path.join(cache_dir, "README")
    readme_content = _read_text_cached(readme_url, readme_path, label="README")

    releases = []
    for line in readme_content.split('\n'):
        found = _RELEASE_ALIAS_RE.search(line)
        if found:
            release_name = found.group(1)
            releases.append(f"debian/{release_name}")
            print(f"Found release: {release_name}")

    return releases


def get_debian_architectures(distro, release_name, cache_dir="./debian_cache"):
    """
    Get supported architectures for a release from its InRelease file.

    Args:
        distro: ``'debian'`` or ``'ubuntu'``.
        release_name: release codename, e.g. ``'sid'`` or ``'noble'``.
        cache_dir: directory holding the cached ``<release>_InRelease`` file.

    Returns:
        The entries of the ``Architectures:`` line without ``'all'``, or an
        empty list when the file has no such line.

    Raises:
        ValueError: if *distro* is neither ``'debian'`` nor ``'ubuntu'``.
        requests.HTTPError: if the download fails with an error status.
    """
    base_url = _lookup_mirror(_INRELEASE_MIRRORS, distro)
    _prepare_cache_dir(cache_dir)

    inrelease_url = f"{base_url}/dists/{release_name}/InRelease"
    inrelease_path = os.path.join(cache_dir, f"{release_name}_InRelease")
    inrelease_content = _read_text_cached(inrelease_url, inrelease_path)

    architectures = []
    for line in inrelease_content.split('\n'):
        if line.lower().startswith('architectures:'):
            # Everything after the first colon is a whitespace-separated list.
            architectures = line.split(':', 1)[1].split()
            break

    if not architectures:
        print("Could not find Architectures field in InRelease file")
        return []

    print(f"Supported architectures for {release_name}: {architectures}")
    return [arch for arch in architectures if arch != 'all']


def get_debian_srcpkg_architecture(distro, release_name, package_name, cache_dir="./debian_cache"):
    """
    Get the Architecture field of a source package from a release's Sources.gz.

    Args:
        distro: ``'debian'`` or ``'ubuntu'``.
        release_name: release codename.
        package_name: source package to look up (case-insensitive).
        cache_dir: directory holding the cached ``<release>_Sources.gz`` file.

    Returns:
        The raw ``Architecture`` value, e.g. ``'any'`` or ``'amd64 arm64'``.

    Raises:
        ValueError: if *distro* is neither ``'debian'`` nor ``'ubuntu'``.
        FileNotFoundError: if the package is not listed in Sources.gz.
        KeyError: if the package's stanza has no ``Architecture`` field.
        requests.HTTPError: if the download fails with an error status.
    """
    base_url = _lookup_mirror(_SOURCES_MIRRORS, distro)
    _prepare_cache_dir(cache_dir)

    sources_url = f"{base_url}/dists/{release_name}/main/source/Sources.gz"
    sources_path = os.path.join(cache_dir, f"{release_name}_Sources.gz")
    sources_content = _read_gzip_cached(sources_url, sources_path, "Sources.gz", release_name)

    package_info = parse_sources_for_package(sources_content, package_name)
    if package_info is None:
        raise FileNotFoundError(
            f"Package '{package_name}' not found in {distro}/{release_name} Sources.gz"
        )
    return package_info['architecture']


def parse_sources_for_package(sources_content, package_name):
    """
    Parse decompressed Sources.gz content to find one package's stanza.

    Returns:
        A dict of lower-cased field names to their first-line values, or
        ``None`` when no stanza's ``Package`` matches *package_name*.
    """
    return _find_package_stanza(sources_content, package_name)


def get_debian_binary_package_filename(distro, release_name, package_name, architecture='arm64', cache_dir="./debian_cache"):
    """
    Get the binary package filename for a package in a release.

    The package is looked up in the ``main`` component's ``Packages.gz`` for
    *architecture*; the filename is synthesized from the stanza found there.

    Args:
        distro: ``'debian'`` or ``'ubuntu'``.
        release_name: release codename.
        package_name: binary package to look up (case-insensitive).
        architecture: architecture whose Packages.gz is consulted.
        cache_dir: directory holding the cached
            ``<release>_<architecture>_Packages.gz`` file.

    Returns:
        ``"<package>_<version>_<architecture>.deb"``, or ``None`` (after
        printing a message) when the package is not listed.

    Raises:
        ValueError: if *distro* is neither ``'debian'`` nor ``'ubuntu'``.
        requests.HTTPError: if the download fails with an error status.
    """
    base_url = _binary_mirror(distro, architecture)
    _prepare_cache_dir(cache_dir)

    packages_url = f"{base_url}/dists/{release_name}/main/binary-{architecture}/Packages.gz"
    packages_path = os.path.join(cache_dir, f"{release_name}_{architecture}_Packages.gz")
    packages_content = _read_gzip_cached(
        packages_url, packages_path, "Packages.gz", f"{release_name} ({architecture})"
    )

    package_info = parse_packages_for_package(packages_content, package_name)
    if package_info is None:
        print(f"Binary package '{package_name}' not found for {architecture}/Packages.gz")
        return None
    return synthesize_binary_package_filename(package_info)


def parse_packages_for_package(packages_content, package_name):
    """
    Parse decompressed Packages.gz content to find one package's stanza.

    Returns:
        A dict of lower-cased field names to their first-line values, or
        ``None`` when no stanza's ``Package`` matches *package_name*.
    """
    return _find_package_stanza(packages_content, package_name)


def synthesize_binary_package_filename(package_info):
    """
    Synthesize the binary package filename from a parsed stanza.

    The format is ``package_version_architecture.deb``.  Missing fields fall
    back to ``'unknown'``, ``'0.0.0'`` and ``'all'`` respectively.
    """
    package = package_info.get('package', 'unknown')
    version = package_info.get('version', '0.0.0')
    architecture = package_info.get('architecture', 'all')
    return f"{package}_{version}_{architecture}.deb"


def main():
    """Collect the base-files filename for every release/arch and write the JSON."""
    releases = [name for name in get_debian_release_names() if name != 'debian/rc-buggy']
    releases += UBUNTU_RELEASES

    release_hash = {}
    for entry in releases:
        distro, release = entry.split('/', 1)

        pkg_architecture = get_debian_srcpkg_architecture(distro, release, PACKAGE_NAME)

        print("\n=== Architecture List ===")
        arch_list = pkg_architecture.split()
        if 'any' in arch_list:
            # 'any' means "whatever the release supports": ask InRelease.
            architectures = get_debian_architectures(distro, release)
        else:
            architectures = arch_list
        if release == 'sid' and 'loong64' not in architectures:
            # loong64 is hidden away in /debian-ports/
            architectures.append('loong64')

        # NOTE: we *cheat* here because base-files is always built for all
        # architectures. This is NOT a generic method usable for all cases;
        # for that you have to check Sources above.
        release_hash[release] = {
            architecture: get_debian_binary_package_filename(
                distro, release, PACKAGE_NAME, architecture
            )
            for architecture in architectures
        }

    json_content = json.dumps(release_hash)
    print(json_content)
    with open(OUTPUT_FILE, "w") as outfile:
        outfile.write(json_content)


if __name__ == "__main__":
    main()