/* CFStringTokenizer.h Copyright (c) 2006-2012, Apple Inc. All rights reserved. */ /*! @header CFStringTokenizer @abstract A CFStringTokenizer object provides the means by which you can tokenize a string. To find a token that includes the character specified by character index and set it as the current token, you call CFStringTokenizerGoToTokenAtIndex. To advance to the next token and set it as the current token, you call CFStringTokenizerAdvanceToNextToken. To get the range of current token, you call CFStringTokenizerGetCurrentTokenRange. You can use CFStringTokenizerCopyCurrentTokenAttribute to get the attribute of current token. If current token is a compound, you can call CFStringTokenizerGetCurrentSubTokens to retrieve the subtokens or derived subtokens contained in the compound token. To guess the language of a string, you call CFStringTokenizerCopyBestStringLanguage. */ #if !defined(__COREFOUNDATION_CFSTRINGTOKENIZER__) #define __COREFOUNDATION_CFSTRINGTOKENIZER__ 1 #include #include #include CF_IMPLICIT_BRIDGING_ENABLED CF_EXTERN_C_BEGIN /* =================================================================== Language Identifier =================================================================== */ /*! @function CFStringTokenizerCopyBestStringLanguage @abstract Guesses the language of a string and returns the BCP 47 string of the language. @param string The string whose language is to be guessed. @param range The range of characters in string whose language to be guessed. The specified range must not exceed the bounds of the string. If the range is empty (length 0), the first few hundred characters in the string are used. @result A language represented in BCP 47 string, or NULL if the language of the string cannot be guessed. @discussion The result is not guaranteed to be accurate. Typically 200-400 characters are required to reliably guess the language of a string. */ CF_EXPORT CFStringRef CFStringTokenizerCopyBestStringLanguage(CFStringRef string, CFRange range) CF_AVAILABLE(10_5, 3_0); /* =================================================================== Tokenization =================================================================== */ typedef struct __CFStringTokenizer * CFStringTokenizerRef; /*! Tokenization options */ enum { /*! Tokenization Unit Use one of tokenization unit options with CFStringTokenizerCreate to specify how the string should be tokenized. */ /* kCFStringTokenizerUnitWord is not locale sensitive. It doesn't return space between words as a token. */ kCFStringTokenizerUnitWord = 0, kCFStringTokenizerUnitSentence = 1, kCFStringTokenizerUnitParagraph = 2, kCFStringTokenizerUnitLineBreak = 3, /* kCFStringTokenizerUnitWordBoundary can be used in double click detection and whole word search. It is locale sensitive. If the locale parameter of CFStringTokenizerCreate is NULL, default locale is used. kCFStringTokenizerUnitWordBoundary returns space between words as a token. */ kCFStringTokenizerUnitWordBoundary = 4, /*! Attribute Specifier Use attribute specifier to tell tokenizer to prepare the specified attribute when it tokenizes the given string. The attribute value can be retrieved by calling CFStringTokenizerCopyCurrentTokenAttribute with one of the attribute option. */ /* Latin Transcription. Used with kCFStringTokenizerUnitWord or kCFStringTokenizerUnitWordBoundary */ kCFStringTokenizerAttributeLatinTranscription = 1UL << 16, /* Language in BCP 47 string. Used with kCFStringTokenizerUnitSentence or kCFStringTokenizerUnitParagraph. */ kCFStringTokenizerAttributeLanguage = 1UL << 17, }; /*! Token type CFStringTokenizerGoToTokenAtIndex / CFStringTokenizerAdvanceToNextToken returns the type of current token. */ typedef CF_OPTIONS(CFOptionFlags, CFStringTokenizerTokenType) { /* Have no token. */ kCFStringTokenizerTokenNone = 0, /* Normal token */ kCFStringTokenizerTokenNormal = 1UL << 0, /*! Compound token which may contain subtokens but with no derived subtokens. Its subtokens can be obtained by calling CFStringTokenizerGetCurrentSubTokens. */ kCFStringTokenizerTokenHasSubTokensMask = 1UL << 1, /*! Compound token which may contain derived subtokens. Its subtokens and derived subtokens can be obtained by calling CFStringTokenizerGetCurrentSubTokens. */ kCFStringTokenizerTokenHasDerivedSubTokensMask = 1UL << 2, kCFStringTokenizerTokenHasHasNumbersMask = 1UL << 3, kCFStringTokenizerTokenHasNonLettersMask = 1UL << 4, kCFStringTokenizerTokenIsCJWordMask = 1UL << 5 }; /*! @function CFStringTokenizerGetTypeID @abstract Get the type identifier. @result the type identifier of all CFStringTokenizer instances. */ CF_EXPORT CFTypeID CFStringTokenizerGetTypeID(void) CF_AVAILABLE(10_5, 3_0); /*! @function CFStringTokenizerCreate @abstract Creates a tokenizer instance. @param alloc The CFAllocator which should be used to allocate memory for the tokenizer and its storage for values. This parameter may be NULL in which case the current default CFAllocator is used. @param string The string to tokenize. @param range The range of characters within the string to be tokenized. The specified range must not exceed the length of the string. @param options Use one of the Tokenization Unit options to specify how the string should be tokenized. Optionally specify one or more attribute specifiers to tell the tokenizer to prepare specified attributes when it tokenizes the string. @param locale The locale to specify language or region specific behavior. Pass NULL if you want tokenizer to identify the locale automatically. @result A reference to the new CFStringTokenizer. */ CF_EXPORT CFStringTokenizerRef CFStringTokenizerCreate(CFAllocatorRef alloc, CFStringRef string, CFRange range, CFOptionFlags options, CFLocaleRef locale) CF_AVAILABLE(10_5, 3_0); /*! @function CFStringTokenizerSetString @abstract Set the string to tokenize. @param tokenizer The reference to CFStringTokenizer returned by CFStringTokenizerCreate. @param string The string to tokenize. @param range The range of characters within the string to be tokenized. The specified range must not exceed the length of the string. */ CF_EXPORT void CFStringTokenizerSetString(CFStringTokenizerRef tokenizer, CFStringRef string, CFRange range) CF_AVAILABLE(10_5, 3_0); /*! @function CFStringTokenizerGoToTokenAtIndex @abstract Random access to a token. Find a token that includes the character specified by character index, and set it as the current token. @param tokenizer The reference to CFStringTokenizer returned by CFStringTokenizerCreate. @param index The index of the Unicode character in the CFString. @result Type of the token if succeeded in finding a token and setting it as current token. kCFStringTokenizerTokenNone if failed in finding a token. @discussion The range and attribute of the token can be obtained by calling CFStringTokenizerGetCurrentTokenRange and CFStringTokenizerCopyCurrentTokenAttribute. If the token is a compound (with type kCFStringTokenizerTokenHasSubTokensMask or kCFStringTokenizerTokenHasDerivedSubTokensMask), its subtokens and (or) derived subtokens can be obtained by calling CFStringTokenizerGetCurrentSubTokens. */ CF_EXPORT CFStringTokenizerTokenType CFStringTokenizerGoToTokenAtIndex(CFStringTokenizerRef tokenizer, CFIndex index) CF_AVAILABLE(10_5, 3_0); /*! @function CFStringTokenizerAdvanceToNextToken @abstract Token enumerator. @param tokenizer The reference to CFStringTokenizer returned by CFStringTokenizerCreate. @result Type of the token if succeeded in finding a token and setting it as current token. kCFStringTokenizerTokenNone if failed in finding a token. @discussion If there is no preceding call to CFStringTokenizerGoToTokenAtIndex or CFStringTokenizerAdvanceToNextToken, it finds the first token in the range specified to CFStringTokenizerCreate. If there is a current token after successful call to CFStringTokenizerGoToTokenAtIndex or CFStringTokenizerAdvanceToNextToken, it proceeds to the next token. If succeeded in finding a token, set it as current token and return its token type. Otherwise invalidate current token and return kCFStringTokenizerTokenNone. The range and attribute of the token can be obtained by calling CFStringTokenizerGetCurrentTokenRange and CFStringTokenizerCopyCurrentTokenAttribute. If the token is a compound (with type kCFStringTokenizerTokenHasSubTokensMask or kCFStringTokenizerTokenHasDerivedSubTokensMask), its subtokens and (or) derived subtokens can be obtained by calling CFStringTokenizerGetCurrentSubTokens. */ CF_EXPORT CFStringTokenizerTokenType CFStringTokenizerAdvanceToNextToken(CFStringTokenizerRef tokenizer) CF_AVAILABLE(10_5, 3_0); /*! @function CFStringTokenizerGetCurrentTokenRange @abstract Returns the range of current token. @param tokenizer The reference to CFStringTokenizer returned by CFStringTokenizerCreate. @result Range of current token, or {kCFNotFound,0} if there is no current token. */ CF_EXPORT CFRange CFStringTokenizerGetCurrentTokenRange(CFStringTokenizerRef tokenizer) CF_AVAILABLE(10_5, 3_0); /*! @function CFStringTokenizerCopyCurrentTokenAttribute @abstract Copies the specified attribute of current token. @param tokenizer The reference to CFStringTokenizer returned by CFStringTokenizerCreate. @param attribute Specify a token attribute you want to obtain. The value is one of kCFStringTokenizerAttributeLatinTranscription or kCFStringTokenizerAttributeLanguage. @result Token attribute, or NULL if current token does not have the specified attribute or if there is no current token. */ CF_EXPORT CFTypeRef CFStringTokenizerCopyCurrentTokenAttribute(CFStringTokenizerRef tokenizer, CFOptionFlags attribute) CF_AVAILABLE(10_5, 3_0); /*! @function CFStringTokenizerGetCurrentSubTokens @abstract Retrieves the subtokens or derived subtokens contained in the compound token. @param tokenizer The reference to CFStringTokenizer returned by CFStringTokenizerCreate. @param ranges An array of CFRange to fill in with the ranges of subtokens. The filled in ranges are relative to the string specified to CFStringTokenizerCreate. This parameter can be NULL. @param maxRangeLength The maximum number of ranges to return. @param derivedSubTokens An array of CFMutableArray to which the derived subtokens are to be added. This parameter can be NULL. @result number of subtokens. @discussion If token type is kCFStringTokenizerTokenNone, the ranges array and derivedSubTokens array are untouched and the return value is 0. If token type is kCFStringTokenizerTokenNormal, the ranges array has one item filled in with the entire range of the token (if maxRangeLength >= 1) and a string taken from the entire token range is added to the derivedSubTokens array and the return value is 1. If token type is kCFStringTokenizerTokenHasSubTokensMask or kCFStringTokenizerTokenHasDerivedSubTokensMask, the ranges array is filled in with as many items as there are subtokens (up to a limit of maxRangeLength). The derivedSubTokens array will have sub tokens added even when the sub token is a substring of the token. If token type is kCFStringTokenizerTokenHasSubTokensMask, the ordinary non-derived subtokens are added to the derivedSubTokens array. */ CF_EXPORT CFIndex CFStringTokenizerGetCurrentSubTokens(CFStringTokenizerRef tokenizer, CFRange *ranges, CFIndex maxRangeLength, CFMutableArrayRef derivedSubTokens) CF_AVAILABLE(10_5, 3_0); CF_EXTERN_C_END CF_IMPLICIT_BRIDGING_DISABLED #endif /* ! __COREFOUNDATION_CFSTRINGTOKENIZER__ */