[Koha] Duplicate Biblio Records

Tim Young Tim.Young at LightSys.org
Sat Jul 22 11:58:23 NZST 2017


I did make an automated script to deduplicate biblio records. Thanks, 
Joy, for the thoughts about choosing the "biggest" record. While it is 
dangerous to call it done before the librarians have told me I did a 
great job, I will go ahead and put the script here in case someone 
else finds it useful someday.

It does create a log file of all the biblio records it deletes, but it 
does not save their contents.  At least you can look at whatever 
remains and try to remember what the contents used to be...
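
If you wanted the log to keep the record contents as well, a couple of 
extra lines just before the DELETE would do it.  Something like this 
(a sketch, untested; it assumes the MARCXML lives in 
biblio_metadata.metadata, the same table the script already reads when 
sizing the records):

    #dump the doomed record's MARCXML into the log before deleting it
    (my $e, my $marc) = RunSQL("SELECT metadata FROM biblio_metadata WHERE biblionumber = $bib;");
    print LOG "  Contents of $bib:\n$marc\n" if($e == 0);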

My apologies to any Perl masters who might be offended by my attempt.  :)

  -----

#!/usr/bin/perl
#For use with Koha 17.05.x

$INSTANCE=""; #default instance
$KOHAMYSQL="koha-mysql"; #put a path here if this cannot be found
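#tables with a biblionumber column that get repointed before each delete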
@TABLELIST=qw(aqorders article_requests biblioimages items hold_fill_targets
              biblioitems old_reserves reserves ratings reviews tags_all
              virtualshelfcontents);

my $doOne=0; #set to 1 to test on a single duplicate group
my $OutFile="DeDup_log.txt";

#####################################
# No need to edit below this line ###
#####################################
open(LOG,'>',$OutFile) or die "Cannot open $OutFile: $!";
#read in the instance
if(@ARGV > 0) { $INSTANCE = $ARGV[0]; }
else { die "Specify a koha instance to dedup"; }

sub RunSQL
{
   my $sql = join ' ', @_;
   my $command = "echo '$sql' | $KOHAMYSQL $INSTANCE";
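   #note: the SQL is wrapped in single quotes for the shell, so any
   #single quote inside $sql would break the command line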
   my $error;
   my $out;
   print "Command: $command\n" if ($doOne == 1);
   
   $out = qx{$command 2>&1};
   $error=$?;
   return $error, $out;
}
my $counter=0;
#Find all the duplicates (records whose title and author match exactly)
print "Calculating: \n";
print LOG "Calculating: \n";
($status, $output) = RunSQL ("SELECT GROUP_CONCAT(biblionumber SEPARATOR \", \")
         AS biblionumbers, title,
         author FROM biblio GROUP BY CONCAT(title,\"/\",author)
         HAVING COUNT(CONCAT(title,\"/\",author))>1;");
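#mysql (in batch mode) returns a header row plus one tab-separated line
#per duplicate group, with the biblionumbers joined by ", "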

#Get Stats.
$dupes=0;
$totalrecs=0;

my @lines = split /\n/, $output;
foreach my $line (@lines) {
     my ($nums, $name) = split /\t/,$line;
     @bibs = split /,\s*/, $nums; #strip the space after each comma
     if(@bibs >1) #a duplicate group has 2+ biblionumbers; this also skips the header line
     {
	$dupes++;
	$totalrecs+=@bibs; #count of matches
     }
}
$ToRemove = $totalrecs - $dupes;
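#e.g. one title appearing under 3 biblionumbers counts as 1 duplicate,
#3 records involved, and 2 records to remove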
print "  Duplicates to clean: $dupes\n";
print LOG "  Duplicates to clean: $dupes\n";
print "  Total Records involved: $totalrecs\n";
print LOG "  Total Records involved: $totalrecs\n";
print "  Records to remove: $ToRemove\n";
print LOG "  Records to remove: $ToRemove\n";

#Now, we do the real work
ONETIME: { #labeled block so the $doOne test mode can exit after the first group
     foreach my $line (@lines) {
	#print "$line\n";
	my ($nums, $name) = split /\t/,$line;
	#print "  $nums\n";
	@bibs = split /,\s*/, $nums;
	if(@bibs >1) #a duplicate group has 2+ biblionumbers; skip the header line
	{
	    my $max=0;
	    my $maxitem=0; #biblionumber of the best record found so far
	    #find the biblio record with the largest size
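	    #the length of the biblio_metadata row (mostly the MARCXML)
	    #is a rough proxy for "most complete record"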
	    foreach my $bib (@bibs)
	    {
		($status, $out) = RunSQL("SELECT * FROM biblio_metadata WHERE biblionumber = $bib");
		my $len = length($out);
		if($len >= $max)
		{
		    $max = $len;
		    $maxitem = $bib;
		}
	    }

	    #if we have a largest size, point everything to that and delete other records
	    if($maxitem != 0)
	    {
	        print LOG "---$name\n";
	        print LOG "Found Best Record: $maxitem\n";
		my $sql;
		foreach my $bib (@bibs)
		{
		    if($bib != $maxitem) #skip the one we are standardizing on
		    {
			#update each table
			foreach my $table (@TABLELIST)
			{
			    $sql="UPDATE $table SET biblionumber = $maxitem WHERE biblionumber=$bib;";
			    #$sql="SELECT biblionumber FROM $table WHERE biblionumber=$bib;";
			    (my $err, my $update)=RunSQL($sql);
			    if($err != 0)  {die $update};
			}
			#Now we are ready to delete
			$sql = "DELETE from biblio where biblionumber = $bib;";
			(my $err, my $update)=RunSQL($sql);
			if($err != 0)  {die $update};
			$counter++;
			print LOG "  Deleting: $bib\n";
			print ".";
		    }
		}
	    }
	    if($doOne == 1) { last ONETIME; }
	}
     }
}
close(LOG);
print "\n";

print "Completed purging $counter records.\n";


