I did make an automated script to deduplicate biblio records. Thanks, Joy, for the thoughts about choosing the "biggest" record. While it is dangerous to say it is done before the librarians have told me I did a great job, I will go ahead and put the script here just in case someone else finds it useful some day. It does create a log file of all the biblio records it deletes. But, does not save the contents. At least you can look at whatever remains and try to remember what the contents used to be... My apologies to any Perl masters who might be offended by my attempt. :)
-----
#!/usr/bin/perl
#
# De-duplicate biblio records for a Koha 17.05.x instance.
#
# Strategy: group biblio rows by "title/author"; within each group keep
# the record whose biblio_metadata row is largest, repoint every child
# table at it, then delete the other duplicates. Deleted biblionumbers
# are written to the log file (the record CONTENTS are not saved).
#
# Usage: dedup.pl <koha-instance-name>

use strict;
use warnings;

my $INSTANCE  = "";            # default instance (overridden by ARGV below)
my $KOHAMYSQL = "koha-mysql";  # put a path here if this cannot be found

# Child tables carrying a biblionumber foreign key; each must be
# repointed at the surviving record before its duplicates are deleted.
my @TABLELIST = qw(
    aqorders article_requests biblioimages items hold_fill_targets
    biblioitems old_reserves reserves ratings reviews tags_all
    virtualshelfcontents
);

my $doOne   = 0;               # set to 1 to test (stop after one duplicate group)
my $OutFile = "DeDup_log.txt";

#####################################
# No need to edit below this line ###
#####################################

open(my $log, '>', $OutFile) or die "Cannot open $OutFile: $!";

# Read in the instance.
if (@ARGV > 0) {
    $INSTANCE = $ARGV[0];
}
else {
    die "Specify a koha instance to dedup";
}

# Run a SQL statement through koha-mysql for the selected instance.
# Returns (shell exit status, combined stdout/stderr output).
# NOTE(review): the statement is interpolated inside single quotes on a
# shell command line, so it must not itself contain single quotes.
sub RunSQL {
    my $sql     = join ' ', @_;
    my $command = "echo '$sql' | $KOHAMYSQL $INSTANCE";
    print "Command: $command\n" if $doOne == 1;
    my $out   = qx{$command 2>&1};
    my $error = $?;
    return $error, $out;
}

my $counter = 0;    # total records deleted

# Find all the duplicates: one row per title/author pair appearing more
# than once, with a comma-separated list of its biblionumbers.
print "Calculating: \n";
print {$log} "Calculating: \n";
my ($status, $output) = RunSQL(
    "SELECT GROUP_CONCAT(biblionumber SEPARATOR \", \") AS biblionumbers, title, author FROM biblio GROUP BY CONCAT(title,\"\/\",author) HAVING COUNT(CONCAT(title,\"\/\",author))>1;"
);

# Get stats before doing any work.
my $dupes     = 0;
my $totalrecs = 0;
my @lines = split /\n/, $output;
foreach my $line (@lines) {
    my ($nums, $name) = split /\t/, $line;
    my @bibs = split /,/, $nums;
    # It is a dup if there are 2 of them; this also skips the mysql
    # column-header line, whose "biblionumbers" field has no comma.
    if (@bibs > 1) {
        $dupes++;
        $totalrecs += @bibs;    # count of matches
    }
}
my $ToRemove = $totalrecs - $dupes;
print " Duplicates to clean: $dupes\n";
print {$log} " Duplicates to clean: $dupes\n";
print " Total Records involved: $totalrecs\n";
print {$log} " Total Records involved: $totalrecs\n";
print " Records to remove: $ToRemove\n";
print {$log} " Records to remove: $ToRemove\n";

# Now, we do the real work.
ONETIME: {
    foreach my $line (@lines) {
        my ($nums, $name) = split /\t/, $line;
        my @bibs = split /,/, $nums;

        # It is a dup if there are 2 of them. Skip the title line.
        next unless @bibs > 1;

        # Find the biblio record with the largest metadata payload.
        my $max     = 0;
        my $maxitem = "0000";
        foreach my $bib (@bibs) {
            my ($st, $out) = RunSQL("SELECT * FROM biblio_metadata WHERE biblionumber = $bib");
            my $len = length($out);
            if ($len >= $max) {
                $max     = $len;
                $maxitem = $bib;
            }
        }

        # If we have a largest size, point everything to that record and
        # delete the other records.
        if ($maxitem != "0000") {
            print {$log} "---$name\n";
            print {$log} "Found Best Record: $maxitem\n";
            foreach my $bib (@bibs) {
                # Skip the one we are standardizing on.
                next if $bib == $maxitem;

                # Update each child table to point at the survivor.
                foreach my $table (@TABLELIST) {
                    my ($err, $update) = RunSQL("UPDATE $table SET biblionumber = $maxitem WHERE biblionumber=$bib;");
                    die $update if $err != 0;
                }

                # Now we are ready to delete the duplicate itself.
                my ($err, $update) = RunSQL("DELETE from biblio where biblionumber = $bib;");
                die $update if $err != 0;
                $counter++;
                print {$log} " Deleting: $bib\n";
                print ".";
            }
        }

        last ONETIME if $doOne == 1;
    }
}

close($log) or die "Cannot close $OutFile: $!";
print "\n";
print "Completed purging $counter records.\n";