From 147695edc4daf33c73bc01baf1451bf65e7aeb0a Mon Sep 17 00:00:00 2001 From: Lucas Date: Wed, 3 Aug 2022 03:33:28 +0000 Subject: [PATCH] fetch: change storage implementation Instead of partitioning over YYYY/mm/dd, take the SHA256 of the URL and partition over the first, second and tail bytes of it. It has the upside of being more rsync-friendly and there is no longer a need to keep a different file, "everything", to look up already retrieved URLs. Also replace the "url" file with a "meta" file, organized as key=value, currently holding URL and retrieval date. --- bin/fetch.sh | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/bin/fetch.sh b/bin/fetch.sh index ba6a1de..1631e87 100644 --- a/bin/fetch.sh +++ b/bin/fetch.sh @@ -44,20 +44,22 @@ shift $((OPTIND - 1)) : ${ARCHIVE_BASEDIR:=~/tmp/archive} mkdir -p "$ARCHIVE_BASEDIR" -everything=$ARCHIVE_BASEDIR/everything -touch "$everything" - -dir=$ARCHIVE_BASEDIR/$(date +%Y/%m/%d) -mkdir -p "$dir" || exit 1 - rc=0 for url; do sha=$(sha256 -qs "$url") - outdir=$dir/$sha - if grep -q "^$url\$" "$everything"; then + t=$sha + h0=${t%${t#??}} + t=${t#??} + h1=${t%${t#??}} + t=${t#??} + ht=$t + + outdir=$ARCHIVE_BASEDIR/$h0/$h1/$ht + + if [ -f "$outdir/file" ]; then printf "%s: already fetched %s\n" "${0##*/}" "$url" >&2 - printf "%s\n" "$ARCHIVE_BASEDIR"/*/*/*/"$sha"/file + printf "%s\n" "$outdir/file" continue fi @@ -65,8 +67,8 @@ for url; do { fetch_cmd -o "$outdir/file" "$url" && - printf "%s\n" "$url" >"$outdir/url" && - printf "%s\n" "$url" >>"$everything" && + printf "%s=%s\n" \ + url "$url" date "$(date +%Y-%m-%d)" >"$outdir/meta" && printf "%s\n" "$outdir/file" } || rc=1 done