Source code for karld.unicode_io

# -*- coding: utf-8 -*-
import sys
import csv
import codecs

from karld import is_py3

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO

from functools import partial
try:
    from itertools import imap
except ImportError:
    imap = map

from operator import methodcaller

__all__ = ['csv_reader',
           'get_csv_row_writer',
           'csv_unicode_reader']

#Unicode IO

__doc__ = """
How To Encoding
=================
If you've tried something like ``unicode('က')`` or  ``u'hello ' + 'wကrld'
or ``str(u'wörld')`` you will have seen UnicodeDecodeError
and UnicodeEncodeError. Likely, you've tried to
read csv data from a file and mixed the data with unicode
and everything went fine until it got to the line with
some word with an accent character and it broke and showed
``UnicodeDecodeError: 'ascii' codec can't decode byte ...``
What do you do?.
You've tried to write sequences of unicode strings
to a csv file and gotten
``UnicodeEncodeError: 'ascii' codec can't encode character u'\\xf6' in position 1: ordinal not in range(128)``
What do you do?

Unicode handles characters used by different languages
around the world, emojis, curly quotes and other *glyphs*.
The textual data in different parts of the world
can have various encodings designed to specifically
handle their glyphs and unicode can represent them all,
but the data must be decoded from that encoding to unicode.

The data was written to the file in a specific encoding,
either deliberately or because that was the default for
the software. Unfortunately, it's up to the reader of the
data to know what the data was encoded in. It can be
connected to the language or locale it was created in.
Sometimes it can be inferred by the data. Many times
it's written in utf-8, which can handle encoding all
the different chars that can be in a unicode string.
It does this by saving chars like ``'¥'``, or in unicode, ``u'\\xa5'``,
as ``'\\xc2\\xa5'``.  ``u'\\xa5'.encode('utf-8')`` results in ``'\\xc2\\xa5'``.
It uses more space, but can do it. By the way, ``'¥'``
is possible in this code because the encoding is declared
at the top of this file.

String transformation methods, such as upper() or lower()
don't work on these chars, like ``'î'`` or ``'ê'`` if they are
encoded as a utf-8 string, but will work if they are
decoded from utf-8 to unicode.

>>> print 'î'.upper()
î
>>> print u'î'.upper()
Î
>>> print 'ê'.upper()
ê
>>> print 'ê'.decode('utf-8').upper()
Ê

The python 2.7 csv module doesn't work with unicode,
so the text it parses must be encoded from unicode
to a str using an encoding that will handle all the
chars in the text. utf-8 is good choice, and thus is
default.

The purpose of this module is to facilitate reading
and writing csv data in whatever encoding your data
is in.
"""

encode_utf8 = methodcaller('encode', "utf-8")
decode_utf8 = methodcaller('decode', "utf-8")


def not_implemented(*args, **kwargs):
    raise NotImplementedError()


if is_py3():
    unicode = str
    decode_utf8_to_unicode = not_implemented
    map_decode_utf8_to_unicode = not_implemented
else:
    decode_utf8_to_unicode = partial(unicode, encoding="utf-8")
    map_decode_utf8_to_unicode = partial(map, decode_utf8_to_unicode)


[docs]def csv_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs): """ Generator the reads serialized unicode csv data. Use this if you have a stream of data in unicode and you want to access the rows of the data as sequences encoded as unicode. Unicode in, unicode out. :param unicode_csv_data: An iterable of unicode strings. :param dialect: csv dialect """ if is_py3(): return csv.reader(unicode_csv_data, dialect=dialect, **kwargs) else: encoded_utf8_data = imap(encode_utf8, unicode_csv_data) reader = csv.reader(encoded_utf8_data, dialect=dialect, **kwargs) return imap(map_decode_utf8_to_unicode, reader)
unicode_csv_unicode_reader = csv_unicode_reader def _utf8_iter_recoder(stream, encoding): """Generator re-encodes input file's lines from a given encoding to utf-8. :param stream: file handle. :param encoding: str of encoding. """ return codecs.iterencode(codecs.iterdecode(stream, encoding), "utf-8")
[docs]def csv_reader(csv_data, dialect=csv.excel, encoding="utf-8", **kwargs): """ Csv row generator that re-encodes to unicode from csv data with a given encoding. Utf-8 data in, unicode out. You may specify a different encoding of the incoming data. :param csv_data: An iterable of str of the specified encoding. :param dialect: csv dialect :param encoding: The encoding of the given data. """ if is_py3(): return csv.reader(csv_data, dialect=csv.excel, **kwargs) reader = csv.reader( _utf8_iter_recoder(csv_data, encoding), dialect=dialect, **kwargs ) return imap(map_decode_utf8_to_unicode, reader)
csv_to_unicode_reader = csv_reader def _encode_unicode_or_identity(value): """ Encode a value to utf-8 only if it's unicode. """ if isinstance(value, unicode): return encode_utf8(value) return value def _encode_write_row(stream, queue, writer, encoder, row): """ Write a row, of unicode data to a cStringIO.StringIO then get the csv row value from the queue and decode from utf-8 to unicode, then to the target encoding and write to the stream. """ writer.writerow(map(_encode_unicode_or_identity, row)) stream.write( encoder.encode( decode_utf8( queue.getvalue() ) ) ) # empty queue queue.truncate(0)
[docs]def get_csv_row_writer(stream, dialect=csv.excel, encoding="utf-8", **kwargs): """ Create a csv, encoding from unicode, row writer. Use returned callable to write rows of unicode data to a stream, such as a file opened in write mode, in utf-8(or another) encoding. :: my_row_data = [ [u'one', u'two'], [u'three', u'four'], ] with open('myfile.csv', 'wt') as myfile: unicode_row_writer = get_unicode_row_writer(myfile) for row in my_row_data: unicode_row_writer(row) """ if is_py3(): writer = csv.writer(stream, dialect=dialect, **kwargs) return writer.writerow else: queue = StringIO() writer = csv.writer(queue, dialect=dialect, **kwargs) encoder = codecs.getincrementalencoder(encoding)() return partial(_encode_write_row, stream, queue, writer, encoder)
get_unicode_row_writer = get_csv_row_writer