summary refs log tree commit diff
diff options
context:
space:
mode:
author Luke T. Shumaker <lukeshu@lukeshu.com> 2023-10-14 19:05:21 -0600
committer Luke T. Shumaker <lukeshu@lukeshu.com> 2023-10-14 19:05:21 -0600
commit c6ba01f3f27872a7e9479ec4cd3da018f231b556 (patch)
tree 57c307936bb30734cc29e64e3495ec441ea94380
parent a76e7458aa34ebe08cbf7048df5d6b183f5bbaef (diff)
parent e35a01b00eb39366b6c8b1294c6a766838313f38 (diff)
Merge branch 'lukeshu/tidy'
-rw-r--r-- Makefile 83
-rwxr-xr-x bin/cdxget 8
-rwxr-xr-x bin/gitify 6
-rwxr-xr-x bin/poolify 48
-rwxr-xr-x bin/wayfore 4
5 files changed, 108 insertions, 41 deletions
diff --git a/Makefile b/Makefile
index ad7e53e..63fc135 100644
--- a/Makefile
+++ b/Makefile
@@ -6,17 +6,28 @@ url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))
dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; )
+# This is split into stages for when Make has to make decisions about
+# the build tree based on the output of a previous stage. That is:
+# these stages exist for a technical GNU Make reason, not for
+# human-comprehensibility reasons; so stages have lopsided sizes; the
+# first two are very small, and almost everything is in the third
+# stage.
all:
+ # Stage 1 ######################################################################
$(MAKE) dat/urlkeys.mk
+ # Stage 2 ######################################################################
$(MAKE) dat/index.mk
+ # Stage 3 ######################################################################
$(MAKE) dat/git
-
-fix:
- grep -rl '<html><body><h1>503' dat | xargs rm -fv --
-
-.PHONY: all fix
+.PHONY: all
# Stage 1 ######################################################################
+#
+# Fetch a listing of all relevant URLs.
+#
+# - `dat/cdxindex.txt`
+# - `dat/urlkeys.txt`
+# - `dat/urlkeys.mk`
dat:
mkdir -p $@
@@ -28,6 +39,17 @@ dat/urlkeys.mk: dat/urlkeys.txt
< $< sed 's/^/urlkeys+=/' > $@
# Stage 2 ######################################################################
+#
+# Fetch the history for each relevant URL.
+#
+# - `dat/each-cdx/$(urlkey).txt` (for each urlkey in `dat/urlkeys.mk`)
+#
+# - `dat/index.txt`
+# has a line for each relevant URL:
+#
+# ${wayback_timestamp:YYYYmmddHHMMSS} ${url}
+#
+# - `dat/index.mk`
ifneq ($(wildcard dat/urlkeys.mk),)
include dat/urlkeys.mk
@@ -40,35 +62,70 @@ dat/index.mk: dat/index.txt
< $< sed -e 's,^,index+=,' -e 's, ,/,' > $@
# Stage 3 ######################################################################
+#
+# The main stage.
ifneq ($(wildcard dat/index.mk),)
-include dat/index.mk
-dat/content-dir/%/index.wahtml:
+# Part 1: Directory indexes:
+#
+# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/index.html`
+#
+# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/readme.txt`
+#
+# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/metadata.txt`
+# has a line for each file mentioned in index.html (this format is
+# controlled by `bin/fmt-metadata`):
+#
+# ${file_name} ${file_timestamp:YYYY-mm-dd HH:MM}
+dat/content-dir/%/index.html:
@mkdir -p '$(@D)'
- curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@
-dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml
- < $< wayfore > $@
+ curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@
dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
- < $< sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > $@
+ < $< sed -n '/^<[pP][rR][eE]>$$/,/<\/[pP][rR][eE]>/p' | sed -e 1d -e 's,</[pP][rR][eE]>.*,,' > $@
dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
- < $< grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@
+ < $< grep -i '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@
content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
+# Part 2: File contents:
+# - `dat/content-file/$(wayback_timestamp:YYYYmmddHHMMSS)/$(file_murl)`
dat/content-file/%:
@mkdir -p '$(@D)'
- curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@
+ curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@
content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
download += $(content-file)
+# `download` is a convenience target to download files without
+# processing them. It isn't depended on by anything.
download: $(download)
.PHONY: download
+# Part 3: Aggregate:
+# - `dat/metadata.txt`
+# has a line for each file mentioned in any index.html:
+#
+# ${dirindex_wayback_timestamp:YYYYmmddHHMMSS} ${branch_name}/${file_name} ${file_html_timestamp:YYYY-mm-dd HH:MM}
+#
+# where the ${dirindex_wayback_timestamp} and ${branch_name} are
+# determined from the path to the relevant index.html.
+#
+# - `dat/pools/`
+# + pass 1 and pass 1.5
+# * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/`
+# * `dat/pools/snaps/${dirindex_wayback_timestamp:YYYYmmddHHMMSS}-${branch_name}/${file_name}` (symlink to the /files/ file)
+# + pass 2 and pass 3:
+# * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/${file_name}` (for each existing /files/ dir)
+#
dat/metadata.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt
- grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -r -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@
+ grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -E -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@
dat/pools: $(download) dat/metadata.txt dat/index.txt
rm -rf -- $@ $@.bak
poolify dat/metadata.txt dat/index.txt || $(dirfail)
+
+# Part 4: Turn each `dat/pools/snaps/*` directory into a Git commit
+#
+# - `dat/git/`
dat/git: dat/pools $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
rm -rf -- $@ $@.bak
gitify $@ || $(dirfail)
diff --git a/bin/cdxget b/bin/cdxget
index 46d56c4..6844fa6 100755
--- a/bin/cdxget
+++ b/bin/cdxget
@@ -1,7 +1,9 @@
#!/usr/bin/env bash
-url='http://web.archive.org/cdx/search/cdx?'
+url='http://web.archive.org/cdx/search/cdx'
+s='?'
for arg in "$@"; do
- url+="$s${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)&"
+ url+="$s${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)"
+ s='&'
done
-curl -sL "$url"
+curl -sfL "$url"
diff --git a/bin/gitify b/bin/gitify
index 1e5d43d..b245f3e 100755
--- a/bin/gitify
+++ b/bin/gitify
@@ -82,7 +82,7 @@ main() {
HACK_NAME='Luke Shumaker'
HACK_EMAIL='lukeshu@lukeshu.com'
- gitdate="$(sed -r 's/(....)(..)(..)(..)(..)(..)/\1-\2-\3T\4:\5:\6 '"$HACK_TZ"'/' <<<"$time")"
+ gitdate="$(sed -E 's/(....)(..)(..)(..)(..)(..)/\1-\2-\3T\4:\5:\6 '"$HACK_TZ"'/' <<<"$time")"
git add .
@@ -99,11 +99,11 @@ main() {
lastbranch="$branch"
fi
if [[ "$branch" == PROGRAMS/CVTUTF ]] && git log -n1 --stat|grep -qF 'ExpectedOutput.txt'; then
- git filter-branch -f --parent-filter 'cat; echo " -p BETA/CVTUTF-1-3"' HEAD^..HEAD
+ FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --parent-filter 'cat; echo " -p BETA/CVTUTF-1-3"' HEAD^..HEAD
git update-ref -d refs/original/refs/heads/"$branch"
fi
if [[ "$branch" == PROGRAMS/CVTUTF.OLD ]] && git log -n1 --stat|grep -qi '.*\.c\s'; then
- git filter-branch -f --parent-filter 'cat; echo " -p PROGRAMS/CVTUTF^"' HEAD^..HEAD
+ FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --parent-filter 'cat; echo " -p PROGRAMS/CVTUTF^"' HEAD^..HEAD
git update-ref -d refs/original/refs/heads/"$branch"
fi
fi
diff --git a/bin/poolify b/bin/poolify
index e256157..649b71b 100755
--- a/bin/poolify
+++ b/bin/poolify
@@ -13,14 +13,27 @@ main() {
set -euE -o pipefail
shopt -s nullglob
- echo '# Pass 1'
- declare -A rewrite
- rewrite[200109261739]=200303310700
- while read -r snap name date time size; do
+ arg_metadata_txt=$1
+ arg_index_txt=$2
+
+ # Overrides ############################################################
+
+ declare -A override_datetime
+ override_datetime[200109261739]=200303310700
+
+ override_synthetic_listings=(
+ #YYYYMMDDHHMM branch_name newfiles
+ '200307291500 ALPHA/CVTUTF-1-1 ExpectedOutput.txt readme.txt'
+ )
+
+ # Main #################################################################
+
+ echo '# Pass 1 (initialize snapshots from $arg_metadata_txt)'
+ while read -r snap name date time; do
dirpart="${name%/*}"
filepart="${name##*/}"
datetime="${date//-/}${time//:/}"
- datetime="${rewrite[$datetime]:-$datetime}"
+ datetime="${override_datetime[$datetime]:-$datetime}"
filedir=dat/pools/files/"${datetime}-${name//\//_}"
snapdir=dat/pools/snaps/"${snap}-${dirpart//\//_}"
if [[ -d "${filedir/.OLD/}" ]]; then
@@ -28,9 +41,9 @@ main() {
fi
mkdir -p -- "$filedir" "$snapdir"
ln -sr "$filedir/$filepart" "$snapdir"
- done < "$1"
+ done < "$arg_metadata_txt"
- echo '# Pass 1.5'
+ echo '# Pass 1.5 (initialize synthetic snapshots)'
# Looking at the data, there are 3 revisions that we DON'T
# have directory listings for. So we need to synthesize
# those.
@@ -39,7 +52,8 @@ main() {
# synthesizing anything, then looking for files ending in
# ".1". They are created during pass 2 if we have a file with
# no matching listing.
- while read -r datetime dirpart newfiles; do
+ for line in "${override_synthetic_listings[@]}"; do
+ read -r datetime dirpart newfiles <<<"$line"
# We need to figure out which files to put in the
# directory listing. We're going to do that by
# mimicking the previous listing with that dirpart.
@@ -65,12 +79,14 @@ main() {
rm -- "$snapdir/$filepart"
ln -sr "$filedir/$filepart" "$snapdir"
done
- done < <(printf '%s\n' \
- '200307291500 ALPHA/CVTUTF-1-1 ExpectedOutput.txt readme.txt' \
- )
+ done
- echo '# Pass 2'
+ echo '# Pass 2 (resolve files)'
while read -r time url; do
+ if [[ "$url" == */ ]]; then
+ # Skip directories
+ continue
+ fi
if [[ "$url" == */2.0-Update/* ]]; then
# Gross hack
continue
@@ -79,10 +95,6 @@ main() {
dirpart="${name%/*}"
filepart="${name##*/}"
- if [[ -z "$filepart" ]]; then
- continue
- fi
-
pools=(dat/pools/files/*-"${name//\//_}")
if [[ "$name" = *.OLD* ]]; then
pname="${name//\//_}"
@@ -119,9 +131,9 @@ main() {
fi
i+=1
done
- done < "$2"
+ done < "$arg_index_txt"
- echo '# Pass 3'
+ echo '# Pass 3 (resolve missing files)'
while read -r missing; do
if [[ -f "${missing/.OLD}/${missing##*_}" ]]; then
ln -sr "${missing/.OLD}/${missing##*_}" "$missing"
diff --git a/bin/wayfore b/bin/wayfore
deleted file mode 100755
index b0bde8a..0000000
--- a/bin/wayfore
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/sed -zrf
-# The opposite of 'wayback'
-s/(<[hH][eE][aA][dD]>).*<!-- End Wayback Rewrite JS Include -->/\1/
-s/<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->//