summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- go/src/cow-dedupe/dedupe.go 70
1 files changed, 49 insertions, 21 deletions
diff --git a/go/src/cow-dedupe/dedupe.go b/go/src/cow-dedupe/dedupe.go
index 457e377..84140f5 100644
--- a/go/src/cow-dedupe/dedupe.go
+++ b/go/src/cow-dedupe/dedupe.go
@@ -80,11 +80,11 @@ func getFiemaps(paths []string) map[string][]string {
return ret
}
-func getChecksums(sl statusline.StatusLine, cmd []string, paths []string) map[string][]string {
+func getChecksums(sl statusline.StatusLine, slfmt string, basecmd []string, paths []string) map[string][]string {
ret := map[string][]string{}
cnt := 0
- sl.Put(fmt.Sprintf("Generating checksums (%v) for files... %d/%d", cmd, cnt, len(paths)))
+ sl.Put(fmt.Sprintf(slfmt, cnt, len(paths)))
pathsTodo := paths
for len(pathsTodo) > 0 {
@@ -99,7 +99,7 @@ func getChecksums(sl statusline.StatusLine, cmd []string, paths []string) map[st
}
pathsTodo = pathsTodo[len(pathsDoing):]
- cmd := exec.Command(cmd[0], append(cmd[1:], pathsDoing...)...)
+ cmd := exec.Command(basecmd[0], append(basecmd[1:], pathsDoing...)...)
stdout, err := cmd.StdoutPipe()
errhandle(err)
cmd.Stderr = os.Stderr
@@ -120,13 +120,12 @@ func getChecksums(sl statusline.StatusLine, cmd []string, paths []string) map[st
ret[checksum] = append(ret[checksum], filename)
cnt++
- sl.Put(fmt.Sprintf("Generating checksums (%v) for files... %d/%d", cmd, cnt, len(paths)))
+ sl.Put(fmt.Sprintf("Generating checksums (%v) for files... %d/%d", basecmd, cnt, len(paths)))
}
errhandle(cmd.Wait())
}
- sl.Put(fmt.Sprintf("Generating checksums (%v) for files... done; summed %d files", cmd, cnt))
- sl.End(true)
+ sl.Put(fmt.Sprintf("Generating checksums (%v) for files... done; summed %d files", basecmd, cnt))
return ret
}
@@ -157,6 +156,21 @@ func dedupe(srcFile string, dupFiles []string) error {
return cmd.Run()
}
+// [ 0s ] size-set[n/d]->c :: sha256-set[t+n/d]->c :: (summed=n deduped=n) :: verb[m/n]
+type fancyStatus struct {
+ sizeN, sizeD, sizeC int
+ sumT, sumN, sumD, sumC int
+ summed, deduped int
+ verb string
+}
+
+func (s fancyStatus) String() string {
+ return fmt.Sprintf("size-set[%d/%d]->%d :: sha256-set[%d+%d/%d]->%d :: (summed=%d deduped=%d) :: %s",
+ s.sizeN, s.sizeD, s.sizeC,
+ s.sumT, s.sumN, s.sumD, s.sumC,
+ s.summed, s.deduped, s.verb)
+}
+
func main() {
// we have low parallelism, don't let syscalls fan-out weird
// on many-core systems
@@ -165,7 +179,7 @@ func main() {
fiemap2filenames := getFiemaps(os.Args[1:])
sl := statusline.StopWatch(statusline.New(os.Stderr), time.Second)
- sl.Put( "Building list of spanning files...")
+ sl.Put("Building list of spanning files...")
filename2fiemap := map[string]string{}
for fiemap, filenames := range fiemap2filenames {
@@ -187,28 +201,34 @@ func main() {
sl.Put(fmt.Sprintf("Building list of spanning files... done; %d files", len(spanningFiles)))
sl.End(true)
- size2filenames := getChecksums(myStatusLine(), []string{"stat", "--printf=%s %n\\n", "--"}, spanningFiles)
+ sl = myStatusLine()
+ size2filenames := getChecksums(sl, "Getting sizes for files... %d/%d",
+ []string{"stat", "--printf=%s %n\\n", "--"}, spanningFiles)
+ sl.End(true)
+
fmt.Fprintf(os.Stderr, " -> %d sets", len(size2filenames))
pruneSingles(size2filenames)
fmt.Fprintf(os.Stderr, " -> %d non-trivial sets\n", len(size2filenames))
sl = myStatusLine()
- sizeCnt := 0
+ var status fancyStatus
+ status.sizeD = len(size2filenames)
for _, filenames := range size2filenames {
- // The list of specific files in size2filenames isn't
- // significant; they'e just proxies for fiemaps.
- sizeStatus := fmt.Sprintf("Working on size-set %d/%d of %d fiemaps",
- sizeCnt, len(size2filenames), len(filenames))
- sl.Put(sizeStatus)
+ status.sizeC = len(filenames)
// Now do strict hashing, instead of the incredibly
// sloppy (but fast) size-bucketing.
- checksum2filenames := getChecksums(statusline.Prefix(myStatusLine(), sizeStatus+" :: "), []string{"sha256sum", "--"}, filenames)
- sl.Put(sizeStatus)
+ status.verb = "sha256sum[%d/%d]"
+ checksum2filenames := getChecksums(sl, status.String(), []string{"sha256sum", "--"}, filenames)
+ status.summed += len(filenames)
+ status.verb = "pruneSingles"
+ sl.Put(status.String())
pruneSingles(checksum2filenames)
// And loop over the smaller, precise buckets
- sumCnt := 0
+ status.sumD = len(checksum2filenames)
for _, filenames := range checksum2filenames {
- sl.Put(fmt.Sprintf("%s :: sha256-set %d/%d of %d fiemaps", sizeStatus, sumCnt, len(checksum2filenames), len(filenames)))
+ status.sumC = len(filenames)
+ status.verb = "prep"
+ sl.Put(status.String())
var fiemaps []string
for _, filename := range filenames {
fiemaps = append(fiemaps, filename2fiemap[filename])
@@ -233,11 +253,19 @@ func main() {
dupFiles = append(dupFiles, fiemap2filenames[fiemap]...)
}
// And actually dedupe those
+ status.verb = fmt.Sprintf("dedupe %q (and %d more)", filepath.Base(srcFile), len(dupFiles))
+ sl.Put(status.String())
dedupe(srcFile, dupFiles) // XXX ignore error?
- sumCnt++
+ status.deduped++
+ status.sumN++
}
- sizeCnt++
+ status.sumT += len(checksum2filenames)
+ status.sumN = 0
+ status.sumD = 0
+ status.sumC = 0
+ status.sizeN++
}
- sl.Put(fmt.Sprintf("Working on size-set %d/%d... done", sizeCnt, len(size2filenames)))
+ status.verb = "done"
+ sl.Put(status.String())
sl.End(true)
}