A script that converts a file to the UTF-8 charset, auto-detecting the source file's encoding.
It also skips the file if it’s newer than 1 minute (just in case the download is not finished).
The uchardet command is required.
# Install the converter as /usr/local/bin/charconv2utf8.sh.
# The here-document delimiter is quoted ('EOF') so that $1, $#, $? and every
# $(...) below are written to the script verbatim instead of being expanded
# by the shell that performs the installation.
tee /usr/local/bin/charconv2utf8.sh <<'EOF'
#!/bin/bash
#
# charconv2utf8.sh - convert a file to UTF-8 in place, auto-detecting the
# source encoding with uchardet.
#
# Usage: charconv2utf8.sh path/to/file
#
# Exit status: 0 when the file was converted or deliberately skipped
# (too recent, or already UTF-8); non-zero on any error.

if ! command -v uchardet &> /dev/null
then
  echo "uchardet could not be found" >&2
  exit 1
fi

if [[ $# -eq 0 ]]
then
  echo "Syntax: $0 path/to/file" >&2
  exit 1
else
  p=$1
fi

if [[ ! -f "${p}" ]]
then
  echo "File ${p} not found" >&2
  exit 1
fi

f=$(basename -- "${p}")
cd -- "$(dirname -- "${p}")" || exit 1

# Always address the file as "./name" so that a name starting with '-' can
# never be mistaken for a command-line option.
file="./${f}"

# skip if it's newer than 1 minute (just in case a ftp download is not terminated)
# find prints the path only when the age test passes; testing the path itself
# avoids interpreting the file name as a glob or a regular expression.
if [[ -z "$(find "${file}" -maxdepth 0 -mmin +1)" ]]
then
  echo "The file ${f} is too recent"
  exit 0
fi

# detect the source encoding (once)
enc=$(uchardet "${file}")

# if already UTF-8 skips
if grep -qi 'UTF-8' <<< "${enc}"
then
  echo "The file ${f} is already UTF-8"
  exit 0
fi

# check that iconv knows the detected encoding; the name is matched as a
# fixed, whole word and the listing itself is not printed
if [[ -z "${enc}" ]] || ! iconv -l | grep -iwF -- "${enc}" > /dev/null
then
  echo "Source file encoding '${enc}' not found" >&2
  exit 1
fi

# proceed to convert to UTF-8
echo "Proceeding to convert ${f} from ${enc} to utf-8"
if ! iconv -f "${enc}" -t UTF-8 "${file}" > "${file}.new"
then
  # never replace the original with a partial conversion
  rm -f -- "${file}.new"
  echo "Conversion of ${f} from ${enc} failed; file left unchanged" >&2
  exit 1
fi
mv -- "${file}.new" "${file}"
EOF
chmod +x /usr/local/bin/charconv2utf8.sh
Leave a Reply