fetch: change storage implementation
Instead of partitioning over YYYY/mm/dd, take the SHA256 of the URL and partition over its first byte, its second byte and the remaining bytes (in hex: the first two characters, the next two characters, and the rest of the digest). This has the upside of being more rsync-friendly, and there is no longer a need to keep a separate file, "everything", to look up already-retrieved URLs. Also replace the "url" file with a "meta" file, organized as key=value lines, currently holding the URL and the retrieval date.
This commit is contained in:
parent
68e11fc972
commit
147695edc4
24
bin/fetch.sh
24
bin/fetch.sh
@ -44,20 +44,22 @@ shift $((OPTIND - 1))
|
||||
: ${ARCHIVE_BASEDIR:=~/tmp/archive}
|
||||
mkdir -p "$ARCHIVE_BASEDIR"
|
||||
|
||||
everything=$ARCHIVE_BASEDIR/everything
|
||||
touch "$everything"
|
||||
|
||||
dir=$ARCHIVE_BASEDIR/$(date +%Y/%m/%d)
|
||||
mkdir -p "$dir" || exit 1
|
||||
|
||||
rc=0
|
||||
for url; do
|
||||
sha=$(sha256 -qs "$url")
|
||||
outdir=$dir/$sha
|
||||
|
||||
if grep -q "^$url\$" "$everything"; then
|
||||
t=$sha
|
||||
h0=${t%${t#??}}
|
||||
t=${t#??}
|
||||
h1=${t%${t#??}}
|
||||
t=${t#??}
|
||||
ht=$t
|
||||
|
||||
outdir=$ARCHIVE_BASEDIR/$h0/$h1/$ht
|
||||
|
||||
if [ -f "$outdir/file" ]; then
|
||||
printf "%s: already fetched %s\n" "${0##*/}" "$url" >&2
|
||||
printf "%s\n" "$ARCHIVE_BASEDIR"/*/*/*/"$sha"/file
|
||||
printf "%s\n" "$outdir/file"
|
||||
continue
|
||||
fi
|
||||
|
||||
@ -65,8 +67,8 @@ for url; do
|
||||
|
||||
{
|
||||
fetch_cmd -o "$outdir/file" "$url" &&
|
||||
printf "%s\n" "$url" >"$outdir/url" &&
|
||||
printf "%s\n" "$url" >>"$everything" &&
|
||||
printf "%s=%s\n" \
|
||||
url "$url" date "$(date +%Y-%m-%d)" >"$outdir/meta" &&
|
||||
printf "%s\n" "$outdir/file"
|
||||
} || rc=1
|
||||
done
|
||||
|
Loading…
Reference in New Issue
Block a user