#!/usr/bin/perl
#                              -*- Mode: Perl -*-
# dirsplit ---
# Author           : Eduard Bloch ( blade@debian.org )
# Last Modified On : Sun, 06 Feb 2005 14:59:51 +0100
# Status           : Working, but use with caution!
# License: GPLv2

my $version="0.3.1";

require v5.8.1;
use strict;
use List::Util 'shuffle';
use Getopt::Long qw(:config no_ignore_case bundling);
use File::Basename;
use Cwd 'abs_path';

my $ret=0;
my $max="4488M";
my $prefix="vol_";
my $acc=500;
my $emode=1;
my $bsize=2048;
my $ofac =50;
my $opt_help;
my $opt_longhelp;
my $opt_sim;
my $opt_dir;
my $opt_flat;
my $opt_cor;
my $opt_move;
my $opt_ver;
my $opt_sln;
my $opt_ln;
my $opt_filter;

my $get_ver;

my $msg="
dirsplit [options] [advanced options] (directory|content-list-file)

 -H|--longhelp Show the long help message with more advanced options
 -n|--no-act   Only print the commands, no action (implies -v)
 -s|--size     NUMBER - Size of the medium (default: $max)
 -e|--expmode  NUMBER - directory exploration mode (recommended, see long help)
 -m|--move     Move files to target dirs (default: create mkisofs catalogs)
 -f|--flatdir  Flat dir mode, don't recreate directory structure
 -p|--prefix   STRING - first part of catalog/directory name (default: vol_)
 -c|--correct  Remove directory space summaries, eg. from du output
 -h|--help     Show this option summary
 -v|--verbose  More verbosity
                   
The complete help can be displayed with the --longhelp (-H) option.
The default mode is creating file catalogs useable with:
    mkisofs -D -r --joliet-long -graft-points -path-list CATALOG

Example:
dirsplit -m -s 700M -e4 random_data_to_backup/
";

my $msglong="
dirsplit [options] [advanced options] < directory | content-list-file >
 -n|--no-act   Only print the commands, no action (implies -v)
 -s|--size     NUMBER - Size of the medium (default: $max)
 -m|--move     Move files to target dirs (default: create mkisofs catalogs)
 -l|--symlink  similar to -m but just creates symlinks in the target dirs
 -L|--hardlink like -l but creates hardlinks
 -p|--prefix   STRING - first part of catalog/directory name (default: vol_)
 -f|--filter   EXPR - Filter expression, see examples below and perlre manpage
 --flat        Flat dir mode, don't recreate subdirectory structure (not recommended)
 -e|--expmode  NUMBER, special exploration modes, used with directory argument

  0: dumb file search with \"du -a\", file sizes are rounded up by du, every file treated as object. Slightly till very inaccurate, dependending on filesystem types and mkisofs options.
  1: (default) native exploration of the specified directory, but file sizes are rounded up to 2048 blocks plus estimated overhead for filenames (see -o option)
  2: like 1, but when single files _and_ directory found in the same directory somewhere, the files are treated as one object
  3: like 2, but don't coalesc when the size of the virtual object becomes too large for a medium size (currently $max)
  4: like 2, but the max. size of the virtual object built on files is limited to $max (create another after max. size)

 -b|--blksize  NUMBER, block size of the target filesystem (currently $bsize). Works in exploration mode.
 -o|--overhead NUMBER, overhead caused by directory entries (as factor for the filename length, default: 50, empiricaly found for Joliet+RR with not-so-deep directory structure). Works in exploration mode.
 -a|--accuracy NUMBER (1=faster, large number=better efficiency, default: 500)
 -c|--correct  Fix input data when it comes from du (KiB expansion, dupes/summaries removal)
 -h|--help     Show this option summary
 -v|--verbose  More verbosity
                   
The content list may be read from a file or from standard input (use -)
and contain lines with file sizes and file/directory names. File sizes
are expected to be in KiB, append modifier letters to recalculate them,
b for bytes, m for megabytes (10^6) or mebibytes (2^10). The default
mode is creating file catalogs useable with
    mkisofs -D -r --joliet-long -graft-points -path-list CATALOG

Examples:
dirsplit -m -s 120M -e4 largedirwithdata/ -p /zipmedia/backup_   #move stuff into splitted backup dirs
dirsplit -s 700M -e2 music/ # make mkisofs catalogs to burn all music to 700M CDRs, keep single files in each dir together
dirsplit -s 700M -e2 -f '/other\\/Soundtracks/' music/ # like above, only take files from other/Soundtracks
dirsplit -s 700M -e2 -f '!/Thumbs.db|Desktop.ini|\\.m3u\$/i' # like above, ignore some junk files and playlists, both letter cases

(old method:)
du -s mp3/Collections/Rock/* mp3/Singles/Pop/* | dirsplit -s 701M -

You should compare the required size of the created catalogs, as in_
for x in *list ; do mkisofs -quiet -D -r --joliet-long -graft-points -path-list \$x -print-size; done
with the media data (cdrecord -v -toc ...). dirsplit calculates very sharp but
without knowing the result in advance, so unexpected deep directory structures
may create additional overhead. Make sure you have some reserve capacity when
specifying the max media size.
";

my %options = (
   "h|help"                => \$opt_help,
   "d|dirhier"            => \$opt_dir,
   "flat"            => \$opt_flat,
   "f|filter=s"            => \$opt_filter,
   "e|expmode=i"            => \$emode,
   "o|overhead=i"            => \$ofac,
   "b|blksize=i"            => \$bsize,
   "n|no-act"            => \$opt_sim,
   "m|move"            => \$opt_move,
   "l|symlink"            => \$opt_sln,
   "L|hardlink"           => \$opt_ln,
   "v|verbose"            => \$opt_ver,
   "s|size=s"             => \$max,
   "p|prefix=s"              => \$prefix,
   "c|correct"               => \$opt_cor,
   "a|accuracy=i"            => \$acc,
   "H|longhelp"            => \$opt_longhelp,
   "version"                 => \$get_ver
);

die $msg unless ( GetOptions(%options));
if($opt_help) {
   print $msg;
   exit 0;
}
if($opt_longhelp) {
   print $msglong;
   exit 0;
}
if($get_ver) {
   print $version;
   exit 0;
}

# ignore the old dirhier setting since it is default now and disable the flag when opt_flat is specified
$opt_dir = !$opt_flat;

$opt_ver = 1 if $opt_sim;
$opt_move=1 if ($opt_sln || $opt_ln);

sub fixnr {
   # args: 
   # Number
   # optional: default multiplier
   my $fac;
   my $nr;
   if($_[0]=~/(\d+)(\D)/) {
      $nr=$1;
      $fac=$2;
   }
   elsif(defined($_[1])) {
      $nr=$_[0];
      $fac=$_[1];
   }
   else {
      return $_[0];
   }
   return $nr*1000000 if($fac eq "m");
   return $nr*1048576 if($fac eq "M");
   return $nr*1000 if($fac eq "k");
   return $nr*1024 if($fac eq "K");
   return $nr if($fac eq "b");
   die "$fac is not a valid multiplier!";
}

sub mkdirhier { 
   return 1 if($_[0] eq ".");
   return 1 if(-d $_[0] && -w $_[0]);
   return 0 if !mkdirhier(dirname($_[0]));
   return mkdir $_[0];
}

my $l;
my @in;
my %names;
my %coalesced; # this will contain arrays with coalesced files

# name to size
my %ntos;

my @indata;
my $inputdir;

$max=fixnr($max);
# about 400kB for iso headers
$max-=420000;

# parse du -s output
if(-f $ARGV[0] || (-f readlink($ARGV[0])) || $ARGV[0] eq "-") {
   die "Exploration mode argument is useless with pregenerated data, aborting...\n" if($emode);
   open($l, "<".$ARGV[0]);
   @indata=<$l>;
   &parseduinput;
}
elsif(-d $ARGV[0] || (-d readlink($ARGV[0]))) {
   if($emode) {
      #die "not implemented yet";
      $opt_cor=0;
      $inputdir=Cwd::abs_path($ARGV[0]);
      &explore($inputdir);
   }
   else {
      $opt_cor=1;
      @indata=`du -l -a $ARGV[0]/`;
      &parseduinput;
   }
}
else {
   die "Directory or contents listing needed!\n";
}

# recursive function
# parameter: directory
# mode 1: descend as far as possible and index all non-directories
# mode 2++:
# put all files of a dir into coaleseced-object, then descend into each dir
sub explore {
   (my $dir) = @_;
   my @stuff;
   my @dirs;
   my @files;

#   print "D: $dir\n";
   opendir(DIR, $dir) || die "Could not open $dir\n";
   while (my $f = readdir(DIR)) {
      next if ($f eq "." || $f eq "..");
      #print "\$f=$opt_filter;\n";
      
      $f="$dir/$f" if($dir ne ".");

      if(-d $f && !-l $f) {
         push(@dirs, $f);
      }
      else {
         if ($opt_filter) {
#            print "D:exp: \$f=$opt_filter;, Wert: ".eval("\$f=~$opt_filter;")."\n";
            if(eval("\$f=~$opt_filter;")) {
#            print "D: added $f\n";
               push(@files, $f);
            }
      }
         else {
#            print "D: added $f\n";
            push(@files, $f);
         }
      }
   }
   closedir(DIR);
   #print "D: $dir fertig",@dirs;

   if($#dirs < 0 && $#files < 0) {
      # this one is empty, register for cosmetics reason
      return if ($opt_filter && !eval("\$dir=~$opt_filter;"));
      $ntos{$dir}=getsize($dir);
      return;
   }
   explore($_) for(@dirs);

   if($emode==1) {
      $ntos{$_}=getsize($_) for(@files);
   }
   else {
      my $filesum=0;
      for(@files) {
         my $tmp=getsize($_);
         if($tmp>$max) {
            # already too large, stop right here
            die "Too large file ($_) for the given max size $max, aborting...\n";
         }
         $filesum += $tmp;
      };
      if($filesum>$max) {
         # too large coal. object...
         if($emode==3) {
            # don't coalesc in this mode, leave them alone
            $ntos{$_}=getsize($_) for(@files);
            return;
         }
         if($emode==4) {
#            print "D: mode 4!\n";
            # a bit complicated, split file set into coal.objects
            my $partsum=0;
            my @sorted=sort(@files);
            my @tmpvol;
            for(my $i=0;$i<=$#sorted;$i++) {
#            print "D: i: $i, partsum: $partsum, file: $sorted[$i]\n";
               my $tmp=getsize($sorted[$i]);
               $partsum+=$tmp;
               if($partsum>$max) {
                  # undo the last step then build the coal.object
                  $partsum-=$tmp;
                  $i--;

#                  print "D: coal: ".join(",", @tmpvol)."\n";
                  my $iname = ("### Coalesced file object, placeholder for the directory $dir up to file $sorted[$i] ###"." "x256);
                  @{$coalesced{$iname}} = @tmpvol;
                  $ntos{$iname}=$partsum;
                  # tmps reseten
                  undef @tmpvol;
                  undef $partsum;
               }
               else {
                  push(@tmpvol, $sorted[$i]);
               }
            }
            return;
         }
      }

      # be an invalid filename
      if($filesum) {
         my $iname = ("### Coalesced file object, placeholder for stuff in the directory $dir ###"." "x256);
         @{$coalesced{$iname}} = @files;
         $ntos{$iname}=$filesum;
      }
   }
}

sub getsize {
   (my $file) = @_;
   my $size = ((stat($file))[7]);
   my $rest = ($size % $bsize);
   $size = ($size + $bsize - $rest) if ($rest);
   return 1+int(200 + $ofac*length(basename($file)) + $size);
}
   

sub parseduinput {
   for(@indata) {
      chomp;
      if(/^(\w+)\s+(.+)/ && $2 ne "./") {
         #print "D: ntos $2 ist ".fixnr($1, "K")."\n";
         $ntos{$2}=fixnr($1, "K");
      }
   }
}

# sort and kill dupes/summaries
if($opt_cor) {
   my @intmp=sort(keys %ntos);
#   die join("\n", @intmp, "");
   my @newin;

   for(my $i=0;$i<=$#intmp;$i++) {
      $_=$intmp[$i];
      chomp;
      # weed out directory summaries
      # print "vgl. ". "$_/ ne substr(".$intmp[$i+1].",0,length($_)+1)) \n";
      /.*(.)$/;
      if ($1 ne "/" && "$_/" ne substr($intmp[$i+1],0,length($_)+1)) { 
         # feed the final data holders
         push(@in, $ntos{$_});
         # linked list behind the hash entry
         my $realname=$_;
         $realname=~s!^\./!!;
         push(@{$names{$ntos{$_}}}, $realname);
      }
   }
}
else {
   # copy around
   for(keys %ntos) {
#      print "wtf, $_, $ntos{$_}\n";
      push(@in, $ntos{$_});
      # linked list behind the hash entry
      push(@{$names{$ntos{$_}}}, $_);
   }
}

for(@in) {
   die "Too large object(s) ($_) for the given max size: ".join(", ",
   @{$names{$_}})."\n" if($_>$max);
}

$a=0;
for(@in) {$a+=$_};
$acc=1 if ($a <= $max); # just generate a list, more trials are pointless
print "\nSumm: $a\n" if($opt_ver);
die "Nothing to do!\n" if($a<4096); # looks like just an empty dir

my $i;
my @out;

# Parms: bin size (int), input array (arr reference), output array (arr reference)
# Returns: wasted space (int)
sub bp_bestfit {
   my $max=$_[0];
   my @in = @{$_[1]};
   my $target = $_[2];
   my @out;
   my @bel;

   my @tmp;
   push(@tmp,$in[0]);
   push(@out, \@tmp);
   $bel[0] = $in[0];
   shift @in;

   for(@in) {
      my $bestplace=$#out+1;
      my $bestwert=$max;
      for($i=0;$i<=$#out;$i++) {
         my $rest;
         $rest=$max-$bel[$i]-$_;
         if($rest>0 && $rest < $bestwert) {
            $bestplace=$i;
            $bestwert=$rest;
         };
      }
      if($bestplace>$#out) {
         my @bin;
         $bel[$bestplace]=$_;
         push(@bin, $_);
         push(@out,\@bin);
      }
      else{
         $bel[$bestplace]+=$_;
         push(  @{$out[$bestplace]}    , $_);
      }
   }
   my $ret=0;
   # count all rests but the last one
   for($i=0;$i<$#out;$i++) {
      $ret+=($max-$bel[$i]);
   }
   @{$target} = @out;
   return $ret;
}

# Parms: bin size (int), input array (arr reference), output array (arr reference)
# Returns: wasted space (int)
sub bp_firstfit {
   my $max=$_[0];
   my @in = @{$_[1]};
   my $target = $_[2];
   my @out;
   my @bel;

   piece: foreach my $obj (@in) {
      # first fit, use the first bin with enough free space
      for($i=0;$i<=$#out;$i++) {
         my $newsize=($bel[$i]+$obj);
#         print $bel[$i]."\n";
         if( $newsize <= $max ) {
#            print "F: bin$i: $bel[$i]+$obj=$newsize\n";
            #fits here
            $bel[$i]=$newsize;
            push(  @{$out[$i]} , $obj);
            next piece; # break
         }
      }
      # neues Bin
      my @bin;
      $bel[$i]=$obj;
#      print "N: bin$i: $bel[$i]=$obj\n";
      push(@bin, $obj);
      push(@out,\@bin);
   }
   my $ret=0;
   # sum up all rests except of the one from the last bin
   for($i=0;$i<$#out;$i++) {
#           print "hm, bel $i ist :".$bel[$i]." und res:".($max-$bel[$i])."\n";
      $ret+=($max-$bel[$i]);
   }
   @{$target} = @out;
#      print "wtf, ".join(",", @{$out[0]})."\n";
   return $ret;
}

my @erg;
my $globwaste=$max*($#in+1);
my $starttime=time;
for(1..$acc) {
   syswrite(STDOUT,".");
   my @tmp;
   #my $waste = bp_bestfit($max, \@in, \@tmp);
   my $waste = bp_firstfit($max, \@in, \@tmp);
   #print "D: waste - $waste\n";
   @in=shuffle(@in);
   if($waste < $globwaste) {
      $globwaste=$waste;
      @erg=@tmp;
   }
   if($starttime && time > $starttime+10) {
      syswrite(STDOUT,"\nSpent already over 10s (for $_ iterations)\nHint: reduce accuracy to make it faster!\n");
      undef $starttime;
   }
}
print "\nCalculated, using ".($#erg+1)." volumes.\n";
   
print "Wasted: $globwaste Byte (estimated, check mkisofs -print-size ...)\n";

# and the real work
$i=1;
my $inDirLen=length($inputdir);
for(@erg) {
   my $o;
   open($o, ">$prefix$i.list") if(! ($opt_move | $opt_sim));
   for(@{$_}) {
      my @stuff;
      my $object=shift(@{$names{$_}});
      if(-e $object) {
         @stuff=($object);
      }
      elsif(defined $coalesced{$object}) {
         @stuff=@{$coalesced{$object}};
      }
      else {
         print "Warning, $object not found, skipping...\n";
         $ret=1;
      }
      my $dirPrefix=dirname($prefix);
      my $prefixBase=basename($prefix);
      my $dirPrefixAbs=Cwd::abs_path($dirPrefix);
      for my $file (@stuff) {
         my $relFile=substr($file,$inDirLen+1);
         my $base=basename($relFile);
         if($opt_move) {
            my $targetsubdir = $dirPrefixAbs."/$prefixBase$i";
            $targetsubdir .= "/".dirname($relFile) if($opt_dir);
            print "$file -> $targetsubdir/$base\n" if($opt_ver);
            if(!$opt_sim) {
               mkdirhier $targetsubdir || die "Problems creating $targetsubdir\n";
               # last check
               die "Could not create $targetsubdir?\n" if(!(-d $targetsubdir && -w $targetsubdir));
               if($opt_sln) {
                  symlink($file, "$targetsubdir/$base");
               }
               elsif($opt_ln) {
                  if(-d $file && !-l $file) {
                     mkdir "$targetsubdir/$base";
                  }
                  else {
                     link($file, "$targetsubdir/$base");
                  }
               }
               else {
                  rename($file, "$targetsubdir/$base");
               }
            }
         }
         else {
            # escape = in mkisofs catalogs, they are used as separator
            my $isoname = ($opt_dir?$relFile : $base);
            $isoname=~s/=/\\=/g;
            my $sourcefile=$file;
            $sourcefile=~s/=/\\=/g;
            print "$i: /$isoname=$sourcefile\n" if $opt_ver;
            print $o "/$isoname=$sourcefile\n" if(!$opt_sim);
         }
      }
   }
   $i++;
   close($o) if($o);
}

exit $ret;
