fetch: change storage implementation

Instead of partitioning over YYYY/mm/dd, take the SHA256 of the URL and
partition over its first byte, its second byte and the remaining bytes
(in hex: the first two characters, the next two, and the rest). This has
the upside of being more rsync-friendly, and there is no longer a need to
keep a separate "everything" file to look up already retrieved URLs. Also
replace the "url" file with a "meta" file, organized as key=value lines,
currently holding the URL and the retrieval date.
This commit is contained in:
Lucas 2022-08-03 03:33:28 +00:00
parent 68e11fc972
commit 147695edc4

View File

@ -44,20 +44,22 @@ shift $((OPTIND - 1))
# NOTE(review): this hunk is rendered without +/- markers, so lines removed by
# the commit and lines added by it appear interleaved (for instance, two `if`
# openers below share a single `fi`). Read it as a diff, not as a runnable
# script.

# Default the archive root to ~/tmp/archive when ARCHIVE_BASEDIR is unset or
# empty, and make sure the directory exists.
: ${ARCHIVE_BASEDIR:=~/tmp/archive}
mkdir -p "$ARCHIVE_BASEDIR"
# Flat "everything" file, created if missing; it is grepped below for the URL.
# NOTE(review): presumably part of the pre-commit layout that the commit
# message says is no longer needed -- confirm against the raw diff.
everything=$ARCHIVE_BASEDIR/everything
touch "$everything"
# Date-partitioned directory (YYYY/mm/dd of the current day); abort the script
# if it cannot be created.
dir=$ARCHIVE_BASEDIR/$(date +%Y/%m/%d)
mkdir -p "$dir" || exit 1
# Overall status; switched to 1 further down when fetching a URL fails.
rc=0
# `for url` without an `in` list iterates over the positional parameters.
for url; do
# Digest of the URL string itself, not of any file content.
# NOTE(review): presumably the BSD sha256 utility, where -s hashes the given
# string and -q prints only the digest -- confirm on the target platform.
sha=$(sha256 -qs "$url")
# Date-based output directory keyed by the digest.
outdir=$dir/$sha
# Exact-line lookup of the URL in the "everything" file.
# NOTE(review): the URL is interpolated into a regular expression, so regex
# metacharacters in it (such as `.` or `?`) are not matched literally.
if grep -q "^$url\$" "$everything"; then
# Split the digest into three path components using parameter expansion only:
#   ${t#??}        is t with its first two characters removed;
#   ${t%${t#??}}   removes that remainder from the end of t, which leaves
#                  just the first two characters.
# h0 receives characters 1-2, h1 characters 3-4, and ht everything after.
t=$sha
h0=${t%${t#??}}
t=${t#??}
h1=${t%${t#??}}
t=${t#??}
ht=$t
# Digest-partitioned output directory: <base>/<h0>/<h1>/<ht>.
outdir=$ARCHIVE_BASEDIR/$h0/$h1/$ht
# A URL counts as already fetched when its output file exists.
if [ -f "$outdir/file" ]; then
# Diagnostic goes to stderr, prefixed with the script's base name.
printf "%s: already fetched %s\n" "${0##*/}" "$url" >&2
# Glob over three directory levels for a directory named after the digest
# (this matches the date-based layout).
printf "%s\n" "$ARCHIVE_BASEDIR"/*/*/*/"$sha"/file
# Path of the stored file goes to stdout.
printf "%s\n" "$outdir/file"
# Nothing more to do for this URL; move on to the next one.
continue
fi
@ -65,8 +67,8 @@ for url; do
# NOTE(review): this hunk is rendered without +/- markers, so lines removed by
# the commit and lines added by it appear interleaved. Read it as a diff, not
# as a runnable script.

# Fetch the URL and record it. The steps are chained with &&, so each one runs
# only if the previous one succeeded; if any step fails, the group fails and
# rc is set to 1 while the loop carries on with the next URL.
{
# NOTE(review): fetch_cmd is defined outside this view; presumably it
# downloads "$url" into the path given with -o -- confirm.
fetch_cmd -o "$outdir/file" "$url" &&
# Store the URL alone in a "url" file next to the downloaded file.
printf "%s\n" "$url" >"$outdir/url" &&
# Append the URL to the "everything" file.
printf "%s\n" "$url" >>"$everything" &&
# Write the "meta" file. printf reuses its format for each pair of arguments,
# so this emits two lines: url=<url> and date=<YYYY-MM-DD of the current day>.
printf "%s=%s\n" \
url "$url" date "$(date +%Y-%m-%d)" >"$outdir/meta" &&
# On success, report the path of the stored file on stdout.
printf "%s\n" "$outdir/file"
} || rc=1
done