Lucas
147695edc4
Instead of partitioning over YYYY/mm/dd, take the SHA256 of the URL and partition over the first, second and tail bytes of it. It has the upside of being more rsync-friendly and there is no longer a need to keep a different file, "everything", to look up already retrieved URLs. Also replace the "url" file with a "meta" file, organized as key=value, currently holding URL and retrieval date.
76 lines
1.6 KiB
Bash
76 lines
1.6 KiB
Bash
#!/bin/sh
# fetch
# Written in 2019-2022 by Lucas
# CC0 1.0 Universal/Public domain - No rights reserved
#
# To the extent possible under law, the author(s) have dedicated all
# copyright and related and neighboring rights to this software to the
# public domain worldwide. This software is distributed without any
# warranty. You should have received a copy of the CC0 Public Domain
# Dedication along with this software. If not, see
# <http://creativecommons.org/publicdomain/zero/1.0/>.

# usage
#
# Print the command-line synopsis to stderr and terminate the script
# with exit status 1. Never returns to the caller.
usage()
{
	printf "Usage: %s [-T] URL [URL ...]\n" "${0##*/}" >&2
	exit 1
}

# fetch_cmd [ftp-arguments ...]
#
# Run ftp(1) with the given arguments, prefixed by the command named in
# the global $torsocks (empty means "run ftp directly").
# Returns the exit status of the ftp invocation.
fetch_cmd()
{
	# ftp(1) is chatty if stdin is a terminal. It prints its operations to
	# stdout if "-o output" is used. In that case, we'll need to redirect
	# stdout to the tty we're being called from, in order to be able to
	# pipe the output to another command.
	#
	# $torsocks is intentionally left unquoted: when it is empty it must
	# expand to no word at all rather than to an empty command name.
	if tty=$(tty) && [ X"$tty" != X"not a tty" ]; then
		$torsocks ftp "$@" >"$tty"
	else
		$torsocks ftp "$@"
	fi
}

# Command prefix used by fetch_cmd. It defaults to torsocks; the -T
# flag clears it so that ftp is run directly.
torsocks=torsocks
while getopts T flag; do
	case $flag in
	T)	torsocks=
		;;
	*)	usage
		;;
	esac
done
shift $((OPTIND - 1))

# At least one URL operand is required.
[ $# -gt 0 ] || usage

# Root directory of the archive. An ARCHIVE_BASEDIR value from the
# environment wins; otherwise fall back to tmp/archive under $HOME.
# The expansion is quoted so the value is not word-split or globbed,
# which is why $HOME is spelled out instead of relying on "~".
: "${ARCHIVE_BASEDIR:=$HOME/tmp/archive}"

# Nothing can be stored without the archive root, so give up early.
mkdir -p "$ARCHIVE_BASEDIR" || exit 1

# Fetch every URL given on the command line into
# $ARCHIVE_BASEDIR/<h0>/<h1>/<ht>/, where h0, h1 and ht are the first
# two hex digits, the next two hex digits and the remainder of the
# SHA256 of the URL. On success the path of the stored file is printed
# to stdout. The script exits 1 if any URL failed, 0 otherwise.
rc=0
for url; do
	# Without a digest there is no directory to store the URL in.
	if ! sha=$(sha256 -qs "$url") || [ -z "$sha" ]; then
		printf "%s: can't hash %s\n" "${0##*/}" "$url" >&2
		rc=1
		continue
	fi

	# Split the digest into its first byte, second byte and tail.
	t=${sha#??}
	h0=${sha%"$t"}
	ht=${t#??}
	h1=${t%"$ht"}

	outdir=$ARCHIVE_BASEDIR/$h0/$h1/$ht

	# The presence of "file" marks a URL as already retrieved.
	if [ -f "$outdir/file" ]; then
		printf "%s: already fetched %s\n" "${0##*/}" "$url" >&2
		printf "%s\n" "$outdir/file"
		continue
	fi

	# Download under a temporary name and rename only once both the
	# download and the metadata are complete. Otherwise an interrupted
	# or failed transfer could leave a partial "file" behind that the
	# check above would mistake for a finished retrieval.
	part=$outdir/file.part
	if mkdir -p "$outdir" &&
	    fetch_cmd -o "$part" "$url" &&
	    printf "%s=%s\n" \
		url "$url" date "$(date +%Y-%m-%d)" >"$outdir/meta" &&
	    mv -f "$part" "$outdir/file" &&
	    printf "%s\n" "$outdir/file"; then
		:
	else
		# Drop whatever the failed attempt left behind.
		rm -f "$part"
		rc=1
	fi
done
exit "$rc"