words.js 3.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. var toString = require('./toString');
  /**
   * Matches each run of ASCII letters and digits (a non-compound word).
   * The global flag makes `String#match` return every run, not only the first.
   */
  var reBasicWord = RegExp('[a-zA-Z0-9]+', 'g');
  /**
   * Character-class range fragments. Each value is regex *source text* meant to
   * be placed between `[` and `]`; the doubled backslashes keep the escapes
   * literal so the regex engine, not the string parser, interprets them.
   */

  // UTF-16 surrogate code units (both halves of astral symbols).
  var rsAstralRange = '\\ud800-\\udfff';

  // Combining marks: U+0300-U+036F and U+FE20-U+FE23.
  var rsComboMarksRange = '\\u0300-\\u036f\\ufe20-\\ufe23';

  // Combining marks for symbols: U+20D0-U+20F0.
  var rsComboSymbolsRange = '\\u20d0-\\u20f0';

  // Dingbats: U+2700-U+27BF.
  var rsDingbatRange = '\\u2700-\\u27bf';

  // ASCII lowercase plus the Latin-1 letters U+00DF-U+00F6 and U+00F8-U+00FF.
  var rsLowerRange = 'a-z\\xdf-\\xf6\\xf8-\\xff';

  // Latin-1 math operators: U+00AC, U+00B1, U+00D7, U+00F7.
  var rsMathOpRange = '\\xac\\xb1\\xd7\\xf7';

  // ASCII/Latin-1 code points that are neither digits nor letters.
  var rsNonCharRange = '\\x00-\\x2f\\x3a-\\x40\\x5b-\\x60\\x7b-\\xbf';

  // General punctuation block: U+2000-U+206F.
  var rsPunctuationRange = '\\u2000-\\u206f';

  // Whitespace characters, split over several lines purely for readability.
  var rsSpaceRange =
    ' \\t\\x0b\\f\\xa0\\ufeff' +
    '\\n\\r\\u2028\\u2029' +
    '\\u1680\\u180e' +
    '\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200a' +
    '\\u202f\\u205f\\u3000';

  // ASCII uppercase plus the Latin-1 letters U+00C0-U+00D6 and U+00D8-U+00DE.
  var rsUpperRange = 'A-Z\\xc0-\\xd6\\xd8-\\xde';

  // Variation selectors U+FE0E and U+FE0F.
  var rsVarRange = '\\ufe0e\\ufe0f';

  // Everything treated as a word break: math operators, non-alphanumerics,
  // general punctuation and whitespace, in that order.
  var rsBreakRange = [
    rsMathOpRange,
    rsNonCharRange,
    rsPunctuationRange,
    rsSpaceRange
  ].join('');
  /**
   * Regex source fragments, each matching one "unit" (a single character
   * class or a short fixed sequence). They are combined into larger patterns
   * by the declarations that follow in this file.
   */

  // An apostrophe: ASCII `'` or U+2019.
  var rsApos = "['\u2019]";

  // One word-breaking character.
  var rsBreak = ['[', rsBreakRange, ']'].join('');

  // One combining mark (diacritic or symbol mark).
  var rsCombo = ['[', rsComboMarksRange, rsComboSymbolsRange, ']'].join('');

  // A run of decimal digits.
  var rsDigits = '\\d+';

  // One dingbat character.
  var rsDingbat = ['[', rsDingbatRange, ']'].join('');

  // One lowercase letter.
  var rsLower = ['[', rsLowerRange, ']'].join('');

  // One character that is none of: surrogate, break, digit, dingbat, lower,
  // upper. `rsDigits` is spliced into a character class here, so its `\d`
  // acts as a class member and its trailing `+` as a literal plus sign.
  var rsMisc = [
    '[^',
    rsAstralRange,
    rsBreakRange,
    rsDigits,
    rsDingbatRange,
    rsLowerRange,
    rsUpperRange,
    ']'
  ].join('');

  // Surrogate pairs for U+1F3FB-U+1F3FF (emoji skin-tone modifiers).
  var rsFitz = '\\ud83c[\\udffb-\\udfff]';

  // Either kind of modifier: a combining mark or a skin-tone modifier.
  var rsModifier = '(?:' + [rsCombo, rsFitz].join('|') + ')';

  // One code unit that is not a surrogate.
  var rsNonAstral = ['[^', rsAstralRange, ']'].join('');

  // Two consecutive surrogate pairs in U+1F1E6-U+1F1FF (regional indicators).
  var rsRegional = '(?:\\ud83c[\\udde6-\\uddff]){2}';

  // Any high surrogate followed by any low surrogate.
  var rsSurrPair = '[\\ud800-\\udbff][\\udc00-\\udfff]';

  // One uppercase letter.
  var rsUpper = ['[', rsUpperRange, ']'].join('');

  // Zero-width joiner, U+200D.
  var rsZWJ = '\\u200d';
  /**
   * Larger regex source fragments assembled from the single-unit fragments
   * declared earlier in this file.
   */

  // A lowercase letter or a "misc" character.
  var rsLowerMisc = '(?:' + [rsLower, rsMisc].join('|') + ')';

  // An uppercase letter or a "misc" character.
  var rsUpperMisc = '(?:' + [rsUpper, rsMisc].join('|') + ')';

  // Optional lowercase contraction suffix: apostrophe + d/ll/m/re/s/t/ve.
  var rsOptLowerContr =
    '(?:' + rsApos + '(?:' + ['d', 'll', 'm', 're', 's', 't', 've'].join('|') + '))?';

  // Optional uppercase contraction suffix: apostrophe + D/LL/M/RE/S/T/VE.
  var rsOptUpperContr =
    '(?:' + rsApos + '(?:' + ['D', 'LL', 'M', 'RE', 'S', 'T', 'VE'].join('|') + '))?';

  // Optional modifier. (Holds regex source text despite the `re` prefix.)
  var reOptMod = rsModifier + '?';

  // Optional variation selector.
  var rsOptVar = '[' + rsVarRange + ']?';

  // Zero or more ZWJ-joined continuations; each is a joiner followed by a
  // non-astral unit, a regional pair or a surrogate pair, then an optional
  // variation selector and an optional modifier.
  var rsOptJoin = [
    '(?:',
    rsZWJ,
    '(?:' + [rsNonAstral, rsRegional, rsSurrPair].join('|') + ')',
    rsOptVar,
    reOptMod,
    ')*'
  ].join('');

  // What may trail an emoji base: selector, modifier, then joined continuations.
  var rsSeq = [rsOptVar, reOptMod, rsOptJoin].join('');

  // An emoji: a dingbat, regional pair or surrogate pair plus its trailing sequence.
  var rsEmoji = '(?:' + [rsDingbat, rsRegional, rsSurrPair].join('|') + ')' + rsSeq;
  /**
   * Matches words in complex or compound strings. The pattern is a global
   * alternation; alternatives are tried in the order listed, so the more
   * specific shapes come first.
   */
  var reComplexWord = (function() {
    // Lookahead: next is a break character, an uppercase letter, or the end.
    var beforeBreakOrUpper = '(?=' + [rsBreak, rsUpper, '$'].join('|') + ')';

    // Lookahead: next is a break character, an uppercase letter followed by a
    // lower/misc character, or the end.
    var beforeBreakOrNewWord = '(?=' + [rsBreak, rsUpper + rsLowerMisc, '$'].join('|') + ')';

    var alternatives = [
      // Optional uppercase, one or more lowercase, optional contraction.
      rsUpper + '?' + rsLower + '+' + rsOptLowerContr + beforeBreakOrUpper,
      // One or more upper/misc characters, optional uppercase contraction.
      rsUpperMisc + '+' + rsOptUpperContr + beforeBreakOrNewWord,
      // Optional uppercase, one or more lower/misc characters, optional contraction.
      rsUpper + '?' + rsLowerMisc + '+' + rsOptLowerContr,
      // One or more uppercase letters, optional uppercase contraction.
      rsUpper + '+' + rsOptUpperContr,
      // A run of digits.
      rsDigits,
      // An emoji sequence.
      rsEmoji
    ];

    return RegExp(alternatives.join('|'), 'g');
  }());
  /**
   * Detects strings that need the more robust word pattern. It matches when
   * the string contains any of the shapes listed below. Deliberately not
   * global, so repeated `.test()` calls carry no `lastIndex` state.
   */
  var reHasComplexWord = RegExp([
    '[a-z][A-Z]',      // lowercase directly followed by uppercase
    '[A-Z]{2,}[a-z]',  // two or more uppercase followed by lowercase
    '[0-9][a-zA-Z]',   // digit directly followed by a letter
    '[a-zA-Z][0-9]',   // letter directly followed by a digit
    '[^a-zA-Z0-9 ]'    // anything other than an ASCII alphanumeric or a space
  ].join('|'));
  /**
   * Splits `string` into an array of its words.
   *
   * When no `pattern` is supplied (or `guard` is truthy), a built-in pattern
   * is chosen: the complex-word pattern if `reHasComplexWord` matches the
   * string, otherwise the basic alphanumeric pattern.
   *
   * @static
   * @memberOf _
   * @since 3.0.0
   * @category String
   * @param {string} [string=''] The string to inspect.
   * @param {RegExp|string} [pattern] The pattern to match words.
   * @param- {Object} [guard] Enables use as an iteratee for methods like `_.map`.
   * @returns {Array} Returns the words of `string`, or an empty array when
   *  nothing matches.
   * @example
   *
   * _.words('fred, barney, & pebbles');
   * // => ['fred', 'barney', 'pebbles']
   *
   * _.words('fred, barney, & pebbles', /[^, ]+/g);
   * // => ['fred', 'barney', '&', 'pebbles']
   */
  function words(string, pattern, guard) {
    var text = toString(string),
        matcher = pattern;

    // A truthy `guard` means the second argument is not a real pattern (for
    // example an index supplied by an iterating caller), so it is ignored.
    if (guard || matcher === undefined) {
      matcher = reHasComplexWord.test(text) ? reComplexWord : reBasicWord;
    }

    var found = text.match(matcher);
    if (!found) {
      return [];
    }
    return found;
  }
  80. module.exports = words;