#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <cctype>
#include <cstddef>
#include <cstdint>
#include <queue>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
|
|
/// Reports whether `ch` is the space character (code 32) or one of the
/// control characters with codes 9 through 13 (tab, line feed, vertical tab,
/// form feed and carriage return in ASCII).
bool is_whitespace(char ch) noexcept {
  // Plain space is checked first.
  if (ch == 32) {
    return true;
  }

  // Everything else that counts as whitespace sits in one contiguous range.
  const bool at_or_above_tab = ch >= 9;
  const bool at_or_below_carriage_return = ch <= 13;
  return at_or_above_tab && at_or_below_carriage_return;
}
|
|
/// Reports whether `c` has a code in one of the ranges 33-47, 58-64, 91-96 or
/// 123-126, i.e. the ASCII punctuation characters
/// !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
bool is_punctuation(char c) noexcept {
  // Anything outside 33..126 (controls, space, DEL, non-ASCII bytes) is out.
  if (c < 33 || c > 126) {
    return false;
  }

  // Inside 33..126 the only non-punctuation codes are the digits and letters.
  const bool is_digit = c >= 48 && c <= 57;
  const bool is_upper = c >= 65 && c <= 90;
  const bool is_lower = c >= 97 && c <= 122;
  return !(is_digit || is_upper || is_lower);
}
|
|
| |
| |
| std::vector<std::string> clean_ngram(std::string const &input, |
| std::string const &ignore, |
| size_t ngram_n) noexcept { |
|
|
| size_t num_grams = 0; |
| std::vector<std::string> ngram_list; |
| std::vector<uint8_t> gram_lengths; |
| std::string current_ngram; |
|
|
| |
| current_ngram.reserve(11 * ngram_n); |
| gram_lengths.reserve(ngram_n); |
|
|
| bool started_gram = false; |
| gram_lengths.push_back(0); |
|
|
| |
| |
| for (auto iter = input.begin(); iter != input.end(); iter++) { |
|
|
| |
| |
| |
| if (is_whitespace(*iter) || gram_lengths.back() > 10) { |
|
|
| |
| while (++iter != input.end() && is_whitespace(*iter)) |
| ; |
| iter--; |
|
|
| if (started_gram) { |
| num_grams += 1; |
|
|
| |
| if (ngram_n == 1) { |
| ngram_list.push_back(current_ngram); |
| current_ngram = current_ngram.substr(gram_lengths.front()); |
| gram_lengths.back() = 0; |
|
|
| |
| } else if (num_grams >= ngram_n) { |
| |
| ngram_list.push_back(current_ngram); |
|
|
| |
| |
| current_ngram = current_ngram.substr(gram_lengths.front() + 1); |
| current_ngram += ' '; |
|
|
| |
| |
| gram_lengths.erase(gram_lengths.begin()); |
| gram_lengths.push_back(0); |
|
|
| |
| } else { |
| current_ngram += ' '; |
| gram_lengths.push_back(0); |
| } |
|
|
| started_gram = false; |
| } |
|
|
| |
| |
| |
| } else if (ignore.find(*iter) != std::string::npos) { |
| continue; |
| } |
|
|
| |
| |
| else { |
| current_ngram += tolower(*iter); |
| gram_lengths.back() += 1; |
| started_gram = true; |
| } |
| } |
|
|
| return ngram_list; |
| } |
|
|
| |
| |
| |
| std::vector<std::tuple<std::string, size_t, size_t>> |
| clean_ngram_with_indices(std::string const &input, std::string const &ignore, |
| size_t ngram_n) noexcept { |
|
|
| size_t num_grams = 0; |
| std::vector<std::tuple<std::string, size_t, size_t>> ngram_list; |
| std::vector<uint8_t> gram_lengths; |
| std::vector<size_t> gram_start_indices; |
| std::string current_ngram; |
|
|
| |
| current_ngram.reserve(11 * ngram_n); |
|
|
| bool started_gram = false; |
| gram_lengths.push_back(0); |
| gram_start_indices.push_back(0); |
|
|
| for (size_t i = 0; i < input.length(); i++) { |
| char ch = input[i]; |
|
|
| |
| if (is_whitespace(ch) || gram_lengths.back() > 10) { |
|
|
| |
| while (++i < input.length() && is_whitespace(input[i])) |
| ; |
| i--; |
|
|
| if (started_gram) { |
| num_grams += 1; |
|
|
| |
| if (ngram_n == 1) { |
| ngram_list.push_back( |
| std::make_tuple(current_ngram, gram_start_indices.front(), i)); |
| current_ngram = current_ngram.substr(gram_lengths.front()); |
| gram_lengths.back() = 0; |
| gram_start_indices.back() = i + 1; |
|
|
| |
| } else if (num_grams >= ngram_n) { |
|
|
| |
| ngram_list.push_back( |
| std::make_tuple(current_ngram, gram_start_indices.front(), i)); |
|
|
| |
| |
| current_ngram = current_ngram.substr(gram_lengths.front() + 1); |
| current_ngram += ' '; |
|
|
| |
| |
| gram_lengths.erase(gram_lengths.begin()); |
| gram_lengths.push_back(0); |
|
|
| gram_start_indices.erase(gram_start_indices.begin()); |
| gram_start_indices.push_back(i + 1); |
|
|
| |
| } else { |
| current_ngram += ' '; |
| gram_lengths.push_back(0); |
| gram_start_indices.push_back(i + 1); |
| } |
|
|
| started_gram = false; |
| } |
|
|
| |
| } else if (ignore.find(ch) != std::string::npos) { |
| continue; |
|
|
| |
| |
| } else { |
| current_ngram += tolower(ch); |
| gram_lengths.back() += 1; |
| started_gram = true; |
| } |
| } |
|
|
| return ngram_list; |
| } |
|
|
| PYBIND11_MODULE(janitor_util, m) { |
| m.doc() = "pybind11 example plugin"; |
| |
| |
| m.def("clean_ngram", &clean_ngram, |
| "Create ngrams of words, ignoring some characters"); |
| m.def("clean_ngram_with_indices", &clean_ngram_with_indices, |
| "Create ngrams of words with indices, ignoring some characters"); |
| } |
|
|
| |
| |
| |
| |
| |
|
|