diff options
-rw-r--r-- | go/src/cow-dedupe/dedupe.go | 70 |
1 files changed, 49 insertions, 21 deletions
diff --git a/go/src/cow-dedupe/dedupe.go b/go/src/cow-dedupe/dedupe.go index 457e377..84140f5 100644 --- a/go/src/cow-dedupe/dedupe.go +++ b/go/src/cow-dedupe/dedupe.go @@ -80,11 +80,11 @@ func getFiemaps(paths []string) map[string][]string { return ret } -func getChecksums(sl statusline.StatusLine, cmd []string, paths []string) map[string][]string { +func getChecksums(sl statusline.StatusLine, slfmt string, basecmd []string, paths []string) map[string][]string { ret := map[string][]string{} cnt := 0 - sl.Put(fmt.Sprintf("Generating checksums (%v) for files... %d/%d", cmd, cnt, len(paths))) + sl.Put(fmt.Sprintf(slfmt, cnt, len(paths))) pathsTodo := paths for len(pathsTodo) > 0 { @@ -99,7 +99,7 @@ func getChecksums(sl statusline.StatusLine, cmd []string, paths []string) map[st } pathsTodo = pathsTodo[len(pathsDoing):] - cmd := exec.Command(cmd[0], append(cmd[1:], pathsDoing...)...) + cmd := exec.Command(basecmd[0], append(basecmd[1:], pathsDoing...)...) stdout, err := cmd.StdoutPipe() errhandle(err) cmd.Stderr = os.Stderr @@ -120,13 +120,12 @@ func getChecksums(sl statusline.StatusLine, cmd []string, paths []string) map[st ret[checksum] = append(ret[checksum], filename) cnt++ - sl.Put(fmt.Sprintf("Generating checksums (%v) for files... %d/%d", cmd, cnt, len(paths))) + sl.Put(fmt.Sprintf("Generating checksums (%v) for files... %d/%d", basecmd, cnt, len(paths))) } errhandle(cmd.Wait()) } - sl.Put(fmt.Sprintf("Generating checksums (%v) for files... done; summed %d files", cmd, cnt)) - sl.End(true) + sl.Put(fmt.Sprintf("Generating checksums (%v) for files... done; summed %d files", basecmd, cnt)) return ret } @@ -157,6 +156,21 @@ func dedupe(srcFile string, dupFiles []string) error { return cmd.Run() } +// [ 0s ] size-set[n/d]->c :: sha256-set[t+n/d]->c :: (summed=n deduped=n) :: verb[m/n] +type fancyStatus struct { + sizeN, sizeD, sizeC int + sumT, sumN, sumD, sumC int + summed, deduped int + verb string +} + +func (s fancyStatus) String() string { + return fmt.Sprintf("size-set[%d/%d]->%d :: sha256-set[%d+%d/%d]->%d :: (summed=%d deduped=%d) :: %s", + s.sizeN, s.sizeD, s.sizeC, + s.sumT, s.sumN, s.sumD, s.sumC, + s.summed, s.deduped, s.verb) +} + func main() { // we have low parallelism, don't let syscalls fan-out weird // on many-core systems @@ -165,7 +179,7 @@ func main() { fiemap2filenames := getFiemaps(os.Args[1:]) sl := statusline.StopWatch(statusline.New(os.Stderr), time.Second) - sl.Put( "Building list of spanning files...") + sl.Put("Building list of spanning files...") filename2fiemap := map[string]string{} for fiemap, filenames := range fiemap2filenames { @@ -187,28 +201,34 @@ func main() { sl.Put(fmt.Sprintf("Building list of spanning files... done; %d files", len(spanningFiles))) sl.End(true) - size2filenames := getChecksums(myStatusLine(), []string{"stat", "--printf=%s %n\\n", "--"}, spanningFiles) + sl = myStatusLine() + size2filenames := getChecksums(sl, "Getting sizes for files... %d/%d", + []string{"stat", "--printf=%s %n\\n", "--"}, spanningFiles) + sl.End(true) + fmt.Fprintf(os.Stderr, " -> %d sets", len(size2filenames)) pruneSingles(size2filenames) fmt.Fprintf(os.Stderr, " -> %d non-trivial sets\n", len(size2filenames)) sl = myStatusLine() - sizeCnt := 0 + var status fancyStatus + status.sizeD = len(size2filenames) for _, filenames := range size2filenames { - // The list of specific files in size2filenames isn't - // significant; they'e just proxies for fiemaps. - sizeStatus := fmt.Sprintf("Working on size-set %d/%d of %d fiemaps", - sizeCnt, len(size2filenames), len(filenames)) - sl.Put(sizeStatus) + status.sizeC = len(filenames) // Now do strict hashing, instead of the incredibly // sloppy (but fast) size-bucketing. - checksum2filenames := getChecksums(statusline.Prefix(myStatusLine(), sizeStatus+" :: "), []string{"sha256sum", "--"}, filenames) - sl.Put(sizeStatus) + status.verb = "sha256sum[%d/%d]" + checksum2filenames := getChecksums(sl, status.String(), []string{"sha256sum", "--"}, filenames) + status.summed += len(filenames) + status.verb = "pruneSingles" + sl.Put(status.String()) pruneSingles(checksum2filenames) // And loop over the smaller, precise buckets - sumCnt := 0 + status.sumD = len(checksum2filenames) for _, filenames := range checksum2filenames { - sl.Put(fmt.Sprintf("%s :: sha256-set %d/%d of %d fiemaps", sizeStatus, sumCnt, len(checksum2filenames), len(filenames))) + status.sumC = len(filenames) + status.verb = "prep" + sl.Put(status.String()) var fiemaps []string for _, filename := range filenames { fiemaps = append(fiemaps, filename2fiemap[filename]) @@ -233,11 +253,19 @@ func main() { dupFiles = append(dupFiles, fiemap2filenames[fiemap]...) } // And actually dedupe those + status.verb = fmt.Sprintf("dedupe %q (and %d more)", filepath.Base(srcFile), len(dupFiles)) + sl.Put(status.String()) dedupe(srcFile, dupFiles) // XXX ignore error? - sumCnt++ + status.deduped++ + status.sumN++ } - sizeCnt++ + status.sumT += len(checksum2filenames) + status.sumN = 0 + status.sumD = 0 + status.sumC = 0 + status.sizeN++ } - sl.Put(fmt.Sprintf("Working on size-set %d/%d... done", sizeCnt, len(size2filenames))) + status.verb = "done" + sl.Put(status.String()) sl.End(true) } |