Copyright | © Herbert Valerio Riedel 2017 |
---|---|
License | BSD3 |
Maintainer | hvr@gnu.org |
Stability | stable |
Safe Haskell | Trustworthy |
Language | Haskell2010 |
Memory-efficient representation of Unicode text strings.
This module is intended to be imported
qualified
, to avoid name
clashes with
Prelude
functions, e.g.
import qualified Data.Text.Short as TS import qualified Data.Text.Short (ShortText)
This modules deliberately omits (common) partial functions, which can be found in Data.Text.Short.Partial instead.
Since: 0.1
Synopsis
- data ShortText
- empty :: ShortText
- singleton :: Char -> ShortText
- pack :: [ Char ] -> ShortText
- append :: ShortText -> ShortText -> ShortText
- concat :: [ ShortText ] -> ShortText
- cons :: Char -> ShortText -> ShortText
- snoc :: ShortText -> Char -> ShortText
- replicate :: Int -> ShortText -> ShortText
- unpack :: ShortText -> [ Char ]
- uncons :: ShortText -> Maybe ( Char , ShortText )
- unsnoc :: ShortText -> Maybe ( ShortText , Char )
- null :: ShortText -> Bool
- length :: ShortText -> Int
- isAscii :: ShortText -> Bool
- all :: ( Char -> Bool ) -> ShortText -> Bool
- any :: ( Char -> Bool ) -> ShortText -> Bool
- find :: ( Char -> Bool ) -> ShortText -> Maybe Char
- isPrefixOf :: ShortText -> ShortText -> Bool
- isSuffixOf :: ShortText -> ShortText -> Bool
- (!?) :: ShortText -> Int -> Maybe Char
- indexMaybe :: ShortText -> Int -> Maybe Char
- indexEndMaybe :: ShortText -> Int -> Maybe Char
- findIndex :: ( Char -> Bool ) -> ShortText -> Maybe Int
- take :: Int -> ShortText -> ShortText
- takeEnd :: Int -> ShortText -> ShortText
- drop :: Int -> ShortText -> ShortText
- dropEnd :: Int -> ShortText -> ShortText
- takeWhile :: ( Char -> Bool ) -> ShortText -> ShortText
- takeWhileEnd :: ( Char -> Bool ) -> ShortText -> ShortText
- dropWhile :: ( Char -> Bool ) -> ShortText -> ShortText
- dropWhileEnd :: ( Char -> Bool ) -> ShortText -> ShortText
- dropAround :: ( Char -> Bool ) -> ShortText -> ShortText
- splitAt :: Int -> ShortText -> ( ShortText , ShortText )
- splitAtEnd :: Int -> ShortText -> ( ShortText , ShortText )
- span :: ( Char -> Bool ) -> ShortText -> ( ShortText , ShortText )
- break :: ( Char -> Bool ) -> ShortText -> ( ShortText , ShortText )
- spanEnd :: ( Char -> Bool ) -> ShortText -> ( ShortText , ShortText )
- breakEnd :: ( Char -> Bool ) -> ShortText -> ( ShortText , ShortText )
- split :: ( Char -> Bool ) -> ShortText -> [ ShortText ]
- stripPrefix :: ShortText -> ShortText -> Maybe ShortText
- stripSuffix :: ShortText -> ShortText -> Maybe ShortText
- intersperse :: Char -> ShortText -> ShortText
- intercalate :: ShortText -> [ ShortText ] -> ShortText
- reverse :: ShortText -> ShortText
- filter :: ( Char -> Bool ) -> ShortText -> ShortText
- foldl :: (a -> Char -> a) -> a -> ShortText -> a
- foldl' :: (a -> Char -> a) -> a -> ShortText -> a
- foldr :: ( Char -> a -> a) -> a -> ShortText -> a
- fromString :: String -> ShortText
- toString :: ShortText -> String
- fromText :: Text -> ShortText
- toText :: ShortText -> Text
- fromShortByteString :: ShortByteString -> Maybe ShortText
- toShortByteString :: ShortText -> ShortByteString
- fromByteString :: ByteString -> Maybe ShortText
- toByteString :: ShortText -> ByteString
- toBuilder :: ShortText -> Builder
The
ShortText
type
A compact representation of Unicode strings.
A
ShortText
value is a sequence of Unicode scalar values, as defined in
§3.9, definition D76 of the Unicode 5.2 standard
;
This means that a
ShortText
is a list of (scalar) Unicode code-points (i.e. code-points in the range
[U+00 .. U+D7FF] ∪ [U+E000 .. U+10FFFF]
).
This type relates to
Text
as
ShortByteString
relates to
ByteString
by providing a more compact type. Please consult the documentation of
Data.ByteString.Short
for more information.
Currently, a boxed unshared
Text
has a memory footprint of 6 words (i.e. 48 bytes on 64-bit systems) plus 2 or 4 bytes per code-point (due to the internal UTF-16 representation). Each
Text
value which can share its payload with another
Text
requires only 4 words additionally. Unlike
ByteString
,
Text
use unpinned memory.
In comparison, the footprint of a boxed
ShortText
is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1, 2, 3, or 4 bytes per code-point (due to the internal UTF-8 representation).
It can be shown that for realistic data
UTF-16 has a space overhead of 50% over UTF-8
.
NOTE
: The
Typeable
instance isn't defined for GHC 7.8 (and older) prior to
text-short-0.1.3
Since: 0.1
Instances
IsList ShortText Source # |
Note
: Surrogate pairs (
Since: 0.1.2 |
Eq ShortText Source # | |
Data ShortText Source # |
It exposes a similar
Since: 0.1.3 |
Defined in Data.Text.Short.Internal gfoldl :: ( forall d b. Data d => c (d -> b) -> d -> c b) -> ( forall g. g -> c g) -> ShortText -> c ShortText Source # gunfold :: ( forall b r. Data b => c (b -> r) -> c r) -> ( forall r. r -> c r) -> Constr -> c ShortText Source # toConstr :: ShortText -> Constr Source # dataTypeOf :: ShortText -> DataType Source # dataCast1 :: Typeable t => ( forall d. Data d => c (t d)) -> Maybe (c ShortText ) Source # dataCast2 :: Typeable t => ( forall d e. ( Data d, Data e) => c (t d e)) -> Maybe (c ShortText ) Source # gmapT :: ( forall b. Data b => b -> b) -> ShortText -> ShortText Source # gmapQl :: (r -> r' -> r) -> r -> ( forall d. Data d => d -> r') -> ShortText -> r Source # gmapQr :: forall r r'. (r' -> r -> r) -> r -> ( forall d. Data d => d -> r') -> ShortText -> r Source # gmapQ :: ( forall d. Data d => d -> u) -> ShortText -> [u] Source # gmapQi :: Int -> ( forall d. Data d => d -> u) -> ShortText -> u Source # gmapM :: Monad m => ( forall d. Data d => d -> m d) -> ShortText -> m ShortText Source # gmapMp :: MonadPlus m => ( forall d. Data d => d -> m d) -> ShortText -> m ShortText Source # gmapMo :: MonadPlus m => ( forall d. Data d => d -> m d) -> ShortText -> m ShortText Source # |
|
Ord ShortText Source # | |
Defined in Data.Text.Short.Internal |
|
Read ShortText Source # | |
Show ShortText Source # | |
IsString ShortText Source # |
Note
: Surrogate pairs (
|
Defined in Data.Text.Short.Internal fromString :: String -> ShortText Source # |
|
Semigroup ShortText Source # | |
Monoid ShortText Source # | |
PrintfArg ShortText Source # |
Since: 0.1.2 |
Defined in Data.Text.Short.Internal formatArg :: ShortText -> FieldFormatter Source # |
|
Binary ShortText Source # | |
NFData ShortText Source # | |
Defined in Data.Text.Short.Internal |
|
Hashable ShortText Source # | |
Lift ShortText Source # |
Since 0.1.3 |
type Item ShortText Source # | |
Defined in Data.Text.Short.Internal |
Basic operations
Construction
singleton :: Char -> ShortText Source #
\(\mathcal{O}(1)\)
Construct
ShortText
from single codepoint.
singleton c == pack [c]
length (singleton c) == 1
>>>
singleton 'A'
"A"
>>>
map singleton ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
["\55295","\65533","\65533","\57344"]
Note : This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
Since: 0.1.2
pack :: [ Char ] -> ShortText Source #
\(\mathcal{O}(n)\)
Construct a
ShortText
from a list of
Char
s.
This is an alias for
fromString
.
Since: 0.1.2
snoc :: ShortText -> Char -> ShortText Source #
\(\mathcal{O}(n)\)
Append a character to the ond of a
ShortText
.
snoc t c == t <> singleton c
Since: 0.1.2
replicate :: Int -> ShortText -> ShortText Source #
\(\mathcal{O}(n*m)\)
Replicate a
ShortText
.
A repetition count smaller than 1 results in an empty string result.
>>>
replicate 3 "jobs!"
"jobs!jobs!jobs!"
>>>
replicate 10000 ""
""
>>>
replicate 0 "nothing"
""
length (replicate n t) == max 0 n * length t
Since: 0.1.2
Deconstruction
Querying & predicates
null :: ShortText -> Bool Source #
\(\mathcal{O}(1)\)
Test whether a
ShortText
is empty.
>>>
null ""
True
null (singleton c) == False
null t == (length t == 0)
Since: 0.1
length :: ShortText -> Int Source #
\(\mathcal{O}(n)\)
Count the number of Unicode code-points in a
ShortText
.
>>>
length "abcd€"
5
>>>
length ""
0
length t >= 0
Since: 0.1
all :: ( Char -> Bool ) -> ShortText -> Bool Source #
\(\mathcal{O}(n)\)
Test whether
all
code points in
ShortText
satisfy a predicate.
>>>
all (const False) ""
True
>>>
all (> 'c') "abcdabcd"
False
>>>
all (/= 'c') "abdabd"
True
Since: 0.1.2
any :: ( Char -> Bool ) -> ShortText -> Bool Source #
\(\mathcal{O}(n)\)
Test whether
any
code points in
ShortText
satisfy a predicate.
>>>
any (> 'c') "abcdabcd"
True
>>>
any (const True) ""
False
>>>
any (== 'c') "abdabd"
False
any p t == not (all (not . p) t)
Since: 0.1.2
find :: ( Char -> Bool ) -> ShortText -> Maybe Char Source #
\(\mathcal{O}(n)\)
Return the left-most codepoint in
ShortText
that satisfies the given predicate.
>>>
find (> 'b') "abcdabcd"
Just 'c'
>>>
find (> 'b') "ababab"
Nothing
Since: 0.1.2
Lookup & indexing
(!?) :: ShortText -> Int -> Maybe Char Source #
\(\mathcal{O}(n)\)
Index
i
-th code-point in
ShortText
.
Infix operator alias of
indexMaybe
>>>
"abcdefg" !? 2
Just 'c'
Since: 0.1.2
findIndex :: ( Char -> Bool ) -> ShortText -> Maybe Int Source #
\(\mathcal{O}(n)\)
Return the index of the left-most codepoint in
ShortText
that satisfies the given predicate.
>>>
findIndex (> 'b') "abcdabcdef"
Just 2
>>>
findIndex (> 'b') "ababab"
Nothing
(indexMaybe t =<< findIndex p t) == find p t
Since: 0.1.2
Splitting
ShortText
s
Basic functions
take :: Int -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\)
Take prefix of given length or return whole
ShortText
if too short.
>>>
take 3 "abcdef"
"abc"
>>>
take 3 "ab"
"ab"
Since: 0.1.2
takeEnd :: Int -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\)
Take suffix of given length or return whole
ShortText
if too short.
>>>
takeEnd 3 "abcdefg"
"efg"
>>>
takeEnd 3 "ab"
"ab"
Since: 0.1.2
takeWhile :: ( Char -> Bool ) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Take longest prefix satisfying given predicate.
takeWhile p t == fst (span p t)
>>>
takeWhile (< 'c') "abcdabcd"
"ab"
Since: 0.1.2
takeWhileEnd :: ( Char -> Bool ) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Take longest suffix satisfying given predicate.
takeWhileEnd p t == snd (spanEnd p t)
>>>
takeWhileEnd (>= 'c') "abcdabcd"
"cd"
Since: 0.1.2
dropWhile :: ( Char -> Bool ) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Remove longest prefix satisfying given predicate.
dropWhile p t == snd (span p t)
>>>
dropWhile (< 'c') "abcdabcd"
"cdabcd"
Since: 0.1.2
dropWhileEnd :: ( Char -> Bool ) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Remove longest suffix satisfying given predicate.
dropWhileEnd p t == fst (spanEnd p t)
>>>
dropWhileEnd (>= 'c') "abcdabcd"
"abcdab"
Since: 0.1.2
dropAround :: ( Char -> Bool ) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\)
Strip characters from the beginning end and of
ShortText
which satisfy given predicate.
>>>
dropAround (== ' ') " white space "
"white space"
>>>
dropAround (> 'a') "bcdefghi"
""
Since: 0.1.2
Pair-valued functions
splitAt :: Int -> ShortText -> ( ShortText , ShortText ) Source #
\(\mathcal{O}(n)\)
Split
ShortText
into two halves.
returns a pair of
splitAt
n t
ShortText
with the following properties:
length (fst (splitAt n t)) == min (length t) (max 0 n)
fst (splitAt n t) <> snd (splitAt n t) == t
>>>
splitAt 2 "abcdef"
("ab","cdef")
>>>
splitAt 10 "abcdef"
("abcdef","")
>>>
splitAt (-1) "abcdef"
("","abcdef")
Since: 0.1.2
splitAtEnd :: Int -> ShortText -> ( ShortText , ShortText ) Source #
\(\mathcal{O}(n)\)
Split
ShortText
into two halves.
returns a pair of
splitAtEnd
n t
ShortText
with the following properties:
length (snd (splitAtEnd n t)) == min (length t) (max 0 n)
fst (splitAtEnd n t) <> snd (splitAtEnd n t) == t
splitAtEnd n t == splitAt (length t - n) t
>>>
splitAtEnd 2 "abcdef"
("abcd","ef")
>>>
splitAtEnd 10 "abcdef"
("","abcdef")
>>>
splitAtEnd (-1) "abcdef"
("abcdef","")
Since: 0.1.2
span :: ( Char -> Bool ) -> ShortText -> ( ShortText , ShortText ) Source #
\(\mathcal{O}(n)\)
Split
ShortText
into longest prefix satisfying the given predicate and the remaining suffix.
>>>
span (< 'c') "abcdabcd"
("ab","cdabcd")
fst (span p t) <> snd (span p t) == t
Since: 0.1.2
break :: ( Char -> Bool ) -> ShortText -> ( ShortText , ShortText ) Source #
\(\mathcal{O}(n)\)
Variant of
span
with negated predicate.
>>>
break (> 'c') "abcdabcd"
("abc","dabcd")
break p t == span (not . p) t
fst (break p t) <> snd (break p t) == t
Since: 0.1.2
spanEnd :: ( Char -> Bool ) -> ShortText -> ( ShortText , ShortText ) Source #
\(\mathcal{O}(n)\)
Split
ShortText
into longest suffix satisfying the given predicate and the preceding prefix.
>>>
spanEnd (> 'c') "abcdabcd"
("abcdabc","d")
fst (spanEnd p t) <> snd (spanEnd p t) == t
Since: 0.1.2
breakEnd :: ( Char -> Bool ) -> ShortText -> ( ShortText , ShortText ) Source #
\(\mathcal{O}(n)\)
Variant of
spanEnd
with negated predicate.
>>>
breakEnd (< 'c') "abcdabcd"
("abcdab","cd")
breakEnd p t == spanEnd (not . p) t
fst (breakEnd p t) <> snd (breakEnd p t) == t
Since: 0.1.2
Breaking into many substrings
split :: ( Char -> Bool ) -> ShortText -> [ ShortText ] Source #
\(\mathcal{O}(n)\) Splits a string into components delimited by separators, where the predicate returns True for a separator element. The resulting components do not contain the separators. Two adjacent separators result in an empty component in the output. eg.
>>>
split (=='a') "aabbaca"
["","","bb","c",""]
>>>
split (=='a') ""
[""]
intercalate (singleton c) (split (== c) t) = t
NOTE
:
split
never returns an empty list to match the semantics of its counterpart from
Data.Text
.
Since: 0.1.3
Suffix & Prefix operations
Transformations
intersperse :: Char -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\)
Insert character between characters of
ShortText
.
>>>
intersperse '*' "_"
"_"
>>>
intersperse '*' "MASH"
"M*A*S*H"
Since: 0.1.2
reverse :: ShortText -> ShortText Source #
\(\mathcal{O}(n)\)
Reverse characters in
ShortText
.
>>>
reverse "star live desserts"
"stressed evil rats"
reverse (singleton c) == singleton c
reverse (reverse t) == t
Since: 0.1.2
filter :: ( Char -> Bool ) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\)
Remove characters from
ShortText
which don't satisfy given predicate.
>>>
filter (`notElem` ['a','e','i','o','u']) "You don't need vowels to convey information!"
"Y dn't nd vwls t cnvy nfrmtn!"
filter (const False) t == ""
filter (const True) t == t
length (filter p t) <= length t
filter p t == pack [ c | c <- unpack t, p c ]
Since: 0.1.2
Folds
foldl :: (a -> Char -> a) -> a -> ShortText -> a Source #
\(\mathcal{O}(n)\)
Reduces the characters of the
ShortText
with
the binary operator and an initial in forward direction (i.e. from
left to right).
>>>
foldl (\_ _ -> True) False ""
False
>>>
foldl (\s c -> c : s) ['.'] "abcd"
"dcba."
Since: 0.1.2
foldl' :: (a -> Char -> a) -> a -> ShortText -> a Source #
\(\mathcal{O}(n)\)
Strict version of
foldl
.
Since: 0.1.2
foldr :: ( Char -> a -> a) -> a -> ShortText -> a Source #
\(\mathcal{O}(n)\)
Reduces the characters of the
ShortText
with
the binary operator and an initial in reverse direction (i.e. from
right to left).
>>>
foldr (\_ _ -> True) False ""
False
>>>
foldr (:) ['.'] "abcd"
"abcd."
Since: 0.1.2
Conversions
String
fromString :: String -> ShortText Source #
\(\mathcal{O}(n)\)
Construct/pack from
String
>>>
fromString []
""
>>>
fromString ['a','b','c']
"abc"
>>>
fromString ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
"\55295\65533\65533\57344"
Note : This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
Since: 0.1
toString :: ShortText -> String Source #
\(\mathcal{O}(n)\)
Convert to
String
(fromString . toString) t == t
Note
: See documentation of
fromString
for why
(
is not an identity function.
toString
.
fromString
)
Since: 0.1
Text
ByteString
fromShortByteString :: ShortByteString -> Maybe ShortText Source #
\(\mathcal{O}(n)\)
Construct
ShortText
from UTF-8 encoded
ShortByteString
This operation doesn't copy the input
ShortByteString
but it
cannot be
\(\mathcal{O}(1)\)
because we need to validate the UTF-8 encoding.
Returns
Nothing
in case of invalid UTF-8 encoding.
>>>
fromShortByteString "\x00\x38\xF0\x90\x8C\x9A" -- U+00 U+38 U+1031A
Just "\NUL8\66330"
>>>
fromShortByteString "\xC0\x80" -- invalid denormalised U+00
Nothing
>>>
fromShortByteString "\xED\xA0\x80" -- U+D800 (non-scalar code-point)
Nothing
>>>
fromShortByteString "\xF4\x8f\xbf\xbf" -- U+10FFFF
Just "\1114111"
>>>
fromShortByteString "\xF4\x90\x80\x80" -- U+110000 (invalid)
Nothing
fromShortByteString (toShortByteString t) == Just t
Since: 0.1
toShortByteString :: ShortText -> ShortByteString Source #
\(\mathcal{O}(0)\)
Converts to UTF-8 encoded
ShortByteString
This operation has effectively no overhead, as it's currently merely a
newtype
-cast.
Since: 0.1
fromByteString :: ByteString -> Maybe ShortText Source #
\(\mathcal{O}(n)\)
Construct
ShortText
from UTF-8 encoded
ByteString
fromByteString
accepts (or rejects) the same input data as
fromShortByteString
.
Returns
Nothing
in case of invalid UTF-8 encoding.
Since: 0.1
toByteString :: ShortText -> ByteString Source #
\(\mathcal{O}(n)\)
Converts to UTF-8 encoded
ByteString
Since: 0.1