env/bin/fetch.sh
Lucas 147695edc4 fetch: change storage implementation
Instead of partitioning over YYYY/mm/dd, take the SHA256 of the URL and
partition over the first, second and tail bytes of it. It has the upside
of being more rsync-friendly and there is no longer a need to keep a
different file, "everything", to look up already retrieved URLs. Also
replace the "url" file with a "meta" file, organized as key=value,
currently holding URL and retrieval date.
2022-08-03 03:33:28 +00:00

76 lines
1.6 KiB
Bash

#!/bin/sh
# fetch
# Written in 2019-2022 by Lucas
# CC0 1.0 Universal/Public domain - No rights reserved
#
# To the extent possible under law, the author(s) have dedicated all
# copyright and related and neighboring rights to this software to the
# public domain worldwide. This software is distributed without any
# warranty. You should have received a copy of the CC0 Public Domain
# Dedication along with this software. If not, see
# <http://creativecommons.org/publicdomain/zero/1.0/>.
# usage: write the one-line synopsis to stderr, then terminate the script
# with exit status 1. The program name is $0 with any leading directory
# components stripped.
usage()
{
	>&2 printf "Usage: %s [-T] URL [URL ...]\n" "${0##*/}"
	exit 1
}
# fetch_cmd: run ftp(1) with the given arguments, prefixed by whatever
# $torsocks holds (left unquoted on purpose so an empty value vanishes).
# The return status is that of the ftp invocation.
#
# When stdin is a terminal ftp(1) is chatty, and with "-o output" it
# writes its progress to stdout. Our own stdout must stay clean so it can
# be piped into another command, so in that case ftp's stdout is sent
# straight to the controlling terminal instead.
fetch_cmd()
{
	# No terminal on stdin: ftp stays quiet, run it untouched.
	if ! tty=$(tty) || [ X"$tty" = X"not a tty" ]; then
		$torsocks ftp "$@"
		return
	fi

	$torsocks ftp "$@" >"$tty"
}
# Command-line handling. Fetches are wrapped in torsocks unless -T is
# given, which empties the prefix. Any other option, or the absence of at
# least one URL operand, prints the usage message and exits.
torsocks=torsocks
while getopts T flag; do
	# With an optstring of just "T", $flag is either "T" or "?".
	if [ X"$flag" != X"T" ]; then
		usage
	fi
	torsocks=
done
shift "$((OPTIND - 1))"
if [ $# -eq 0 ]; then
	usage
fi
# Root of the archive tree; may be overridden from the environment.
: ${ARCHIVE_BASEDIR:=~/tmp/archive}
mkdir -p "$ARCHIVE_BASEDIR" || exit 1

# For every URL operand: store the document under a directory derived from
# the SHA256 of the URL and print the path of the stored file on stdout.
# URLs that are already archived are reported on stderr and their existing
# path is printed. The exit status is 1 if any URL could not be archived.
rc=0
for url; do
	# The hex digest selects the directory: first byte (2 hex digits),
	# second byte, then the remainder. Without a full 64-digit digest
	# every URL would map to the same directory, so refuse to go on.
	sha=$(sha256 -qs "$url") || sha=
	if [ "${#sha}" -ne 64 ]; then
		printf "%s: cannot hash %s\n" "${0##*/}" "$url" >&2
		rc=1
		continue
	fi
	t=$sha
	h0=${t%"${t#??}"}
	t=${t#??}
	h1=${t%"${t#??}"}
	ht=${t#??}
	outdir=$ARCHIVE_BASEDIR/$h0/$h1/$ht

	# "$outdir/file" is the marker for a completed fetch.
	if [ -f "$outdir/file" ]; then
		printf "%s: already fetched %s\n" "${0##*/}" "$url" >&2
		printf "%s\n" "$outdir/file"
		continue
	fi

	# Download under a temporary name and rename only once both the
	# transfer and the meta file have succeeded. Otherwise a failed or
	# interrupted transfer could leave a truncated "file" behind that
	# later runs would report as already fetched.
	part=$outdir/file.part
	if mkdir -p "$outdir" &&
	    fetch_cmd -o "$part" "$url" &&
	    printf "%s=%s\n" \
		url "$url" date "$(date +%Y-%m-%d)" >"$outdir/meta" &&
	    mv -f -- "$part" "$outdir/file"; then
		printf "%s\n" "$outdir/file" || rc=1
	else
		rm -f -- "$part"
		rc=1
	fi
done
exit "$rc"