fetch: change storage implementation
Instead of partitioning by retrieval date (YYYY/mm/dd), take the SHA256 of the URL and partition on its first byte, its second byte and the remaining bytes (as hex digits). This has the upside of being more rsync-friendly, and there is no longer a need to keep a separate file, "everything", to look up already retrieved URLs. Also replace the "url" file with a "meta" file, organized as key=value lines, currently holding the URL and the retrieval date.
This commit is contained in:
parent
68e11fc972
commit
147695edc4
24
bin/fetch.sh
24
bin/fetch.sh
@ -44,20 +44,22 @@ shift $((OPTIND - 1))
|
|||||||
: ${ARCHIVE_BASEDIR:=~/tmp/archive}
|
: ${ARCHIVE_BASEDIR:=~/tmp/archive}
|
||||||
mkdir -p "$ARCHIVE_BASEDIR"
|
mkdir -p "$ARCHIVE_BASEDIR"
|
||||||
|
|
||||||
everything=$ARCHIVE_BASEDIR/everything
|
|
||||||
touch "$everything"
|
|
||||||
|
|
||||||
dir=$ARCHIVE_BASEDIR/$(date +%Y/%m/%d)
|
|
||||||
mkdir -p "$dir" || exit 1
|
|
||||||
|
|
||||||
rc=0
|
rc=0
|
||||||
for url; do
|
for url; do
|
||||||
sha=$(sha256 -qs "$url")
|
sha=$(sha256 -qs "$url")
|
||||||
outdir=$dir/$sha
|
|
||||||
|
|
||||||
if grep -q "^$url\$" "$everything"; then
|
t=$sha
|
||||||
|
h0=${t%${t#??}}
|
||||||
|
t=${t#??}
|
||||||
|
h1=${t%${t#??}}
|
||||||
|
t=${t#??}
|
||||||
|
ht=$t
|
||||||
|
|
||||||
|
outdir=$ARCHIVE_BASEDIR/$h0/$h1/$ht
|
||||||
|
|
||||||
|
if [ -f "$outdir/file" ]; then
|
||||||
printf "%s: already fetched %s\n" "${0##*/}" "$url" >&2
|
printf "%s: already fetched %s\n" "${0##*/}" "$url" >&2
|
||||||
printf "%s\n" "$ARCHIVE_BASEDIR"/*/*/*/"$sha"/file
|
printf "%s\n" "$outdir/file"
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -65,8 +67,8 @@ for url; do
|
|||||||
|
|
||||||
{
|
{
|
||||||
fetch_cmd -o "$outdir/file" "$url" &&
|
fetch_cmd -o "$outdir/file" "$url" &&
|
||||||
printf "%s\n" "$url" >"$outdir/url" &&
|
printf "%s=%s\n" \
|
||||||
printf "%s\n" "$url" >>"$everything" &&
|
url "$url" date "$(date +%Y-%m-%d)" >"$outdir/meta" &&
|
||||||
printf "%s\n" "$outdir/file"
|
printf "%s\n" "$outdir/file"
|
||||||
} || rc=1
|
} || rc=1
|
||||||
done
|
done
|
||||||
|
Loading…
Reference in New Issue
Block a user