#!/bin/bash
# Copyright 2018 Jarvan Wang
# Copyright 2017 Vimal Manohar
# Apache 2.0.
# Default option values. They must be assigned before parse_options.sh is
# sourced so that command-line options (e.g. --cmd) can override them.
nj=40       # number of jobs used to split the subset data dir and the output
cmd=run.pl  # job launcher; invoked as: $cmd JOB=1:N <log-file> <command...>

# Pick up the recipe environment when a path.sh is present in the
# current directory.
[ -f ./path.sh ] && . ./path.sh

. ./utils/parse_options.sh
# Exactly three positional arguments are required; otherwise print the usage
# text and exit with status 1.
if [ $# -ne 3 ]; then
  cat <<EOF
This script creates a lattice directory containing a subset of
utterances contained in <subset-data-dir> from the
original lattice directory containing lattices for utterances in
<full-data-dir>.
The number of split jobs in the output lattice directory is
given by --nj; it does not depend on the number of jobs in the
original lattice directory.
Usage: $0 [options] <subset-data-dir> <lat-dir> <subset-lat-dir>
e.g.: $0 data/train exp/tri3_lat_sp exp/tri3_lat
Options:
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --nj <nj>                                        # number of jobs in <subset-lat-dir> (currently $nj).
EOF
  exit 1
fi
# Positional arguments.
subset_data=$1  # data directory listing the utterances to keep
lat_dir=$2      # source lattice directory (provides num_jobs and lat.*.gz)
dir=$3          # output lattice directory

# Number of lat.JOB.gz archives in the source directory.
ori_nj=$(cat "$lat_dir/num_jobs") || exit 1

# Nothing below can work without the output directory, so fail early.
mkdir -p "$dir" || exit 1

# Best-effort copy of auxiliary files: not every file or pattern has to exist
# in the source directory, so a non-zero status from cp is deliberately
# ignored (cp still reports the missing ones on stderr).
cp "$lat_dir"/{final.mdl,*.mat,*_opts,tree} "$dir/" || true
# The phones directory is optional as well; here the error output is
# suppressed too.
cp -r "$lat_dir/phones" "$dir" 2>/dev/null || true
# Stage 1: unpack every source archive lat.JOB.gz into a temporary archive
# plus an scp index (lat_tmp.JOB.ark / lat_tmp.JOB.scp) in the output
# directory, one job per source archive.
# $cmd is intentionally left unquoted: it may carry launcher options.
$cmd "JOB=1:$ori_nj" "$dir/log/copy_lattices.JOB.log" \
  lattice-copy "ark:gunzip -c $lat_dir/lat.JOB.gz |" \
  "ark,scp:$dir/lat_tmp.JOB.ark,$dir/lat_tmp.JOB.scp" || exit 1

# Stage 2: concatenate the per-job indexes. The raw concatenation is kept as
# lat_tmp.scp.bak, the same file name the previous write-then-rename left
# behind.
for n in $(seq "$ori_nj"); do
  cat "$dir/lat_tmp.$n.scp" || exit 1
done > "$dir/lat_tmp.scp.bak"

# Stage 3: build the final index. Each line is split on whitespace into
# <key> <location>; when a key occurs more than once the last occurrence
# wins, and the output is sorted by key in Perl's default string order.
perl -e '
  my %location;
  while (<>) {
    chomp;
    my ($key, $ark) = split;
    $location{$key} = $ark;
  }
  for my $key (sort keys %location) {
    printf("%s %s\n", $key, $location{$key});
  }
' "$dir/lat_tmp.scp.bak" > "$dir/lat_tmp.scp" || exit 1
# Split the subset data directory into $nj parts. Stop here if that fails:
# the filtering jobs below read split${nj}/JOB/utt2spk from the result.
utils/split_data.sh "$subset_data" "$nj" || exit 1

# For each part, pass its utt2spk together with the combined index
# lat_tmp.scp to utils/filter_scp.pl, and write the lattices it yields to a
# gzipped archive lat.JOB.gz in the output directory.
# $cmd is intentionally left unquoted: it may carry launcher options.
$cmd "JOB=1:$nj" "$dir/log/filter_lattices.JOB.log" \
  lattice-copy \
  "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/lat_tmp.scp |" \
  "ark:| gzip -c > $dir/lat.JOB.gz" || exit 1

# Record how many lat.*.gz archives the output directory contains.
echo "$nj" > "$dir/num_jobs" || exit 1

# NOTE(review): cleanup of the temporary files is disabled, so the
# lat_tmp.*.ark / lat_tmp.*.scp files and lat_tmp.scp remain in $dir.
#rm $dir/lat_tmp.*.{ark,scp} $dir/lat_tmp.scp
exit 0