generate-base-files-info-json.py: rewrite base-files info script to use apt repo data

Most of this code was written with qwen3-coder:30b.
Author: tabris
Date: 2025-12-03 18:32:07 -05:00
Committed by: Igor
Parent: 26a16ac268
Commit: 00d97809d4

#!/usr/bin/env python3
#NOTE: most of this code written with qwen3-coder:30b
import os
import requests
from pathlib import Path
import json
import sys
import re
import gzip
from urllib.parse import urljoin
def get_debian_release_names(cache_dir="./debian_cache"):
    """
    Get Debian release names from the dists/README file
    """
    # Create cache directory if it doesn't exist
    Path(cache_dir).mkdir(exist_ok=True)
    # Build URL
    readme_url = "http://deb.debian.org/debian/dists/README"
    readme_path = os.path.join(cache_dir, "README")
    # Check if we already have the README file
    if os.path.exists(readme_path):
        print(f"Using cached README: {readme_path}")
        with open(readme_path, 'r') as f:
            readme_content = f.read()
    else:
print("invalid distro %s, quit" % distro)
sys.exit(1)
# Step 1: Fetch HTML content from the URL
response = requests.get(distro_url)
html_content = response.content # Use .content for lxml to handle byte data
# Step 2: Parse HTML with lxml
parser = etree.HTMLParser()
tree = etree.fromstring(html_content, parser)
# Step 3: Extract data
for h3 in tree.xpath('//h3'):
section_title = h3.text
ul = h3.xpath('./following-sibling::ul[1]')
debian_all_package_info = {}
if ul:
list_items = ul[0].xpath('.//li')
for li in list_items:
debian_package_info = {}
item_text = li.xpath('.//text()[not(parent::a)]')
item_class = li.get("class")
package_file_release = item_class
package_file_version = item_text[1].split(":")[0]
architectures = ["arm64", "armhf", "amd64", "riscv64", "loong64"]
arch_info = item_text[1].split(":")[1]
for arch in architectures:
if arch in arch_info:
package_filename = f"{package_name}_{package_file_version}_{arch}.deb"
debian_package_info[arch] = package_filename
debian_all_package_info[item_class] = debian_package_info
return debian_all_package_info
if len(sys.argv) < 2:
print("Usage: python parse.py <package_name>")
sys.exit(1)
package_name = sys.argv[1]
debian_info = get_package_info_from_upstream("debian", package_name)
ubuntu_info = get_package_info_from_upstream("ubuntu", package_name)
if debian_info and ubuntu_info:
all_info_result = {**debian_info, **ubuntu_info}
json_file_name = package_name + ".json"
print("Downloading README...")
response = requests.get(readme_url, timeout=30)
response.raise_for_status()
readme_content = response.text
# Save to cache
with open(readme_path, 'w') as f:
f.write(readme_content)
# Extract release names using regex
# Pattern: \S+, or (\S+)\s+ - matches "oldstable, or bookworm" and captures "bookworm"
release_pattern = r'\S+, or (\S+)\s+'
releases = []
for line in readme_content.split('\n'):
if line.strip():
match = re.search(release_pattern, line)
if match:
release_name = match.group(1)
releases.append(f"debian/{release_name}")
print(f"Found release: {release_name}")
return releases
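# Usage sketch (output values illustrative, not a verified snapshot of the
# live README):
#   get_debian_release_names() -> ["debian/bookworm", "debian/trixie", "debian/sid", ...]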
def get_debian_architectures(distro, release_name, cache_dir="./debian_cache"):
    """
    Get supported architectures for a release from its InRelease file
    """
    # Create cache directory if it doesn't exist
    Path(cache_dir).mkdir(exist_ok=True)
    # Build URLs
    match distro:
        case 'debian':
            base_url = "http://deb.debian.org/debian"
        case 'ubuntu':
            base_url = "http://archive.ubuntu.com/ubuntu"
        case _:
            raise ValueError(f"unknown distro: {distro}")
    inrelease_url = f"{base_url}/dists/{release_name}/InRelease"
    inrelease_path = os.path.join(cache_dir, f"{release_name}_InRelease")
    # Check if we already have the file
    if os.path.exists(inrelease_path):
        #print(f"Using cached file: {inrelease_path}")
        with open(inrelease_path, 'r') as f:
            inrelease_content = f.read()
    else:
        #print(f"Downloading InRelease for {release_name}...")
        response = requests.get(inrelease_url, timeout=30)
        response.raise_for_status()
        inrelease_content = response.text
        # Save to cache
        with open(inrelease_path, 'w') as f:
            f.write(inrelease_content)
    # Extract architectures from the "Architectures:" line of the InRelease file
    architectures = []
    for line in inrelease_content.split('\n'):
        if line.lower().startswith('architectures:'):
            # Take everything after the colon, split on whitespace
            arch_line = line.split(':', 1)[1].strip()
            architectures = [arch.strip() for arch in arch_line.split() if arch.strip()]
            break
    if architectures:
        print(f"Supported architectures for {release_name}: {architectures}")
        # 'all' is not a real hardware architecture; drop it
        if 'all' in architectures:
            architectures.remove('all')
        return architectures
    else:
        print("Could not find Architectures field in InRelease file")
        return []
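# Usage sketch (architecture list illustrative):
#   get_debian_architectures('debian', 'trixie')
#   -> ['amd64', 'arm64', 'armel', 'armhf', ...] with 'all' already removed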
def get_debian_srcpkg_architecture(distro, release_name, package_name, cache_dir="./debian_cache"):
    """
    Get the Architecture field of a source package in a given release
    """
    # Create cache directory if it doesn't exist
    Path(cache_dir).mkdir(exist_ok=True)
    # Build URLs
    match distro:
        case 'debian':
            base_url = "http://deb.debian.org/debian"
        case 'ubuntu':
            #base_url = "http://archive.ubuntu.com/ubuntu"
            base_url = "http://ports.ubuntu.com"
        case _:
            raise ValueError(f"unknown distro: {distro}")
    sources_url = f"{base_url}/dists/{release_name}/main/source/Sources.gz"
    sources_path = os.path.join(cache_dir, f"{release_name}_Sources.gz")
    # Check if we already have the Sources.gz file
    if os.path.exists(sources_path):
        print(f"Using cached Sources.gz: {sources_path}")
    else:
        print(f"Downloading Sources.gz for {release_name}...")
        response = requests.get(sources_url, timeout=30)
        response.raise_for_status()
        # Save to cache
        with open(sources_path, 'wb') as f:
            f.write(response.content)
    # Decompress and read
    with gzip.open(sources_path, 'rt') as f:
        sources_content = f.read()
    # Parse the Sources file to find the package
    package_info = parse_sources_for_package(sources_content, package_name)
    if package_info:
        return package_info['architecture']
    else:
        raise FileNotFoundError(f"Package '{package_name}' not found in {distro}/{release_name} Sources.gz")
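# The Sources 'Architecture' field is a space-separated list such as "any",
# "all", or explicit names like "amd64 arm64" (examples illustrative);
# base-files carries "any", which the main loop below expands via InRelease.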
def parse_sources_for_package(sources_content, package_name):
    """
    Parse Sources file content to find one package's stanza
    """
    # Stanzas are separated by blank lines
    packages = sources_content.split('\n\n')
    for package_entry in packages:
        if not package_entry.strip():
            continue
        package_info = {}
        for line in package_entry.split('\n'):
            if ':' in line:
                key, value = line.split(':', 1)
                package_info[key.strip().lower()] = value.strip()
        # Check if this is our package
        if package_info.get('package', '').lower() == package_name.lower():
            return package_info
    return None
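# A stanza is deb822-style, e.g. (abridged, field values illustrative):
#   Package: base-files
#   Version: 13.8
#   Architecture: any
# Caveat (an assumption about this parser, not the format): multi-line
# continuation fields are not folded here, which is fine for the single-line
# fields this script reads.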
def get_debian_binary_package_filename(distro, release_name, package_name, architecture='arm64', cache_dir="./debian_cache"):
    """
    Get the binary package filename for a given package in a release.
    This is more involved because we need the per-architecture Packages file.
    """
    # Create cache directory if it doesn't exist
    Path(cache_dir).mkdir(exist_ok=True)
    # Build URLs for the Packages file
    match distro:
        case 'debian':
            if architecture == 'loong64':
                base_url = "http://ftp.ports.debian.org/debian-ports"
            else:
                base_url = "http://ftp.debian.org/debian"
        case 'ubuntu':
            if re.match("(i386|amd64)", architecture):  # regex, as there is amd64 and amd64v3
                base_url = "http://archive.ubuntu.com/ubuntu"
            else:
                base_url = "http://ports.ubuntu.com"
        case _:
            raise ValueError(f"unknown distro: {distro}")
    packages_url = f"{base_url}/dists/{release_name}/main/binary-{architecture}/Packages.gz"
    packages_path = os.path.join(cache_dir, f"{release_name}_{architecture}_Packages.gz")
    # Check if we already have the Packages.gz file
    if os.path.exists(packages_path):
        print(f"Using cached Packages.gz: {packages_path}")
    else:
        print(f"Downloading Packages.gz for {release_name} ({architecture})...")
        response = requests.get(packages_url, timeout=30)
        response.raise_for_status()
        # Save to cache
        with open(packages_path, 'wb') as f:
            f.write(response.content)
    # Decompress and read
    with gzip.open(packages_path, 'rt') as f:
        packages_content = f.read()
    # Parse the Packages file to find the package
    package_info = parse_packages_for_package(packages_content, package_name)
    if package_info:
        # Synthesize the package filename
        filename = synthesize_binary_package_filename(package_info)
        #print(f"Synthesized binary package filename: {filename}")
        return filename
    else:
        print(f"Binary package '{package_name}' not found in {architecture}/Packages.gz")
        return None
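# Usage sketch (returned filename illustrative):
#   get_debian_binary_package_filename('debian', 'trixie', 'base-files', 'arm64')
#   -> 'base-files_13.8_arm64.deb'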
def parse_packages_for_package(packages_content, package_name):
    """
    Parse Packages file content to find one package's stanza
    (same stanza format and logic as parse_sources_for_package above)
    """
    # Stanzas are separated by blank lines
    packages = packages_content.split('\n\n')
    for package_entry in packages:
        if not package_entry.strip():
            continue
        package_info = {}
        for line in package_entry.split('\n'):
            if ':' in line:
                key, value = line.split(':', 1)
                package_info[key.strip().lower()] = value.strip()
        # Check if this is our package
        if package_info.get('package', '').lower() == package_name.lower():
            return package_info
    return None
def synthesize_binary_package_filename(package_info):
    """
    Synthesize the Debian binary package filename from a Packages stanza
    """
    # Extract needed fields
    package = package_info.get('package', 'unknown')
    version = package_info.get('version', '0.0.0')
    architecture = package_info.get('architecture', 'all')
    # For binary packages, the filename format is:
    # package_version_architecture.deb
    filename = f"{package}_{version}_{architecture}.deb"
    return filename
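# e.g. {'package': 'base-files', 'version': '13.8', 'architecture': 'arm64'}
# -> 'base-files_13.8_arm64.deb' (values illustrative). Caveat: for versions
# with an epoch ('1:...'), on-disk pool filenames drop the epoch; base-files
# has no epoch, so the simple join above is assumed to be safe here.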
# Example usage:
if __name__ == "__main__":
    releases = get_debian_release_names()
    if 'debian/rc-buggy' in releases:
        releases.remove('debian/rc-buggy')
    # FIXME: the supported Ubuntu releases are fetchable from
    # changelogs.ubuntu.com/meta-release, filtering by 'Supported: 1'.
    # Don't do this yet b/c jammy goes EOS Apr 2027 and we don't know if we'll
    # be ready; also, resolute isn't in that changelog as of 2025-12-03.
    # A hedged sketch of that fetch follows this hardcoded list.
    releases += ['ubuntu/jammy', 'ubuntu/noble', 'ubuntu/plucky', 'ubuntu/questing', 'ubuntu/resolute']
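    # Sketch of the meta-release fetch described above (untested; assumes the
    # file keeps its current deb822-ish stanza format with Dist/Supported fields):
    #   resp = requests.get("https://changelogs.ubuntu.com/meta-release", timeout=30)
    #   for stanza in resp.text.split('\n\n'):
    #       fields = dict(line.split(': ', 1) for line in stanza.splitlines() if ': ' in line)
    #       if fields.get('Supported') == '1':
    #           releases.append(f"ubuntu/{fields['Dist']}")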
    release_hash = {}
    for release in releases:
        distro, release = release.split('/')
        packages = {}
        pkg_architecture = get_debian_srcpkg_architecture(distro, release, "base-files")
        # Get architectures from InRelease
        print("\n=== Architecture List ===")
        arch_list = pkg_architecture.split()
        if 'any' in arch_list:
            architectures = get_debian_architectures(distro, release)
        else:
            architectures = arch_list
        if release == 'sid':
            # loong64 is hidden away in /debian-ports/
            architectures += ['loong64']
        # Get binary package filename
        #print("\n=== Binary Package ===")
        # NOTE: we *cheat* here because base-files is always built for all
        # architectures. This is NOT a generic method usable for all packages;
        # for that you have to check the Sources Architecture field as above.
        for architecture in architectures:
            binary_filename = get_debian_binary_package_filename(distro, release, "base-files", architecture)
            packages[architecture] = binary_filename
        release_hash[release] = packages
    json_content = json.dumps(release_hash)
    print(json_content)
json_file_name = "base-files.json"
with open(json_file_name, "w") as outfile:
json.dump(all_info_result, outfile)
else:
print("failed to get package info")
sys.exit(1)
outfile.write(json_content)
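# The emitted base-files.json maps release -> architecture -> .deb filename,
# e.g. (abridged, values illustrative):
#   {"trixie": {"amd64": "base-files_13.8_amd64.deb", "arm64": "base-files_13.8_arm64.deb"},
#    "jammy": {"amd64": "base-files_12ubuntu4_amd64.deb"}}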