This article reconstructs the underlying code of the string class from the perspective of data structures, including array-based sequential implementation and block list-based linked implementation. It also provides the usage of related functions in the string library. Subsequently, the article explains the classic string matching algorithm: the KMP algorithm and its applications, and finally introduces the concept of regular expressions.
/* This is the implementation of hand-made string */ #include<stdexcept> #include<iostream> #include<algorithm>//just for the min function #include<cstring>
//overload of index operator char seqString::operator[](int index) const{ if(index < 0 || index >= len){ throw std::runtime_error("Invalid index!"); }else{ return data[index]; } }
//insert a string in the certain position voidseqString::insert(int start, const seqString& other, int insertsize){ if(insertsize == -1){ //This means insert the whole size of other insertsize = other.length(); }
//remove a substring from a certain position voidseqString::remove(int deletelength,int start ){ if(start == -1){ //then delete the last leng characters from the back start = len - deletelength; //len represents the length of the whole string; }
if(start + deletelength >= len){ //delete all the characters after the start index data [start] = '\0'; len = start; char * tmp = data; //recycle the memory data = newchar [len+1]; for(int i = 0; i<= len; i++){ data[i] = tmp[i]; } delete[] tmp; }else{ len -= deletelength; char *tmp = data; data = newchar [len+1]; for(int i = 0; i < start; i++){ data[i] = tmp[i]; } for(int i = start + deletelength; tmp[i] != '\0'; i++){ data[i - deletelength] = tmp[i]; } data[len] = '\0'; delete[] tmp; } }
//get the length of the string intseqString::length()const{ return len; }
//judge whether the string is empty boolseqString::isEmpty()const{ return len == 0; }
//get a substring from a certain position and a certain length seqString seqString::substr(int start, int sublength)const{ if(start < 0 || start >= len){ throw std::runtime_error("Invalid starting position!"); }else{ if(sublength + start >= len){ sublength = len - start; }
//the overload function of << and >> operator std::istream& operator>>(std::istream& is, seqString& s) { constint bufferSize = 1024; char buffer[bufferSize]; is >> buffer;
private: node *head; //head pointer int length; //length of the string int nodesize; //capacity for every block node
//several private function tools voidclear(); //release all the memories voidfindPos(int start, int& pos, node *&p)const; // find the position of the certain node voidsplit(node *p, int pos); //split nodes voidmerge(node *p); //merge nodes
public: linkString(constchar* s = ""); // Constructor with default parameter linkString(const linkString& other); // Copy constructor ~linkString(); // Destructor linkString& operator=(const linkString& other); // Assignment operator overload linkString& operator+=(const linkString& other); // Compound assignment operator overload (concatenation) linkString& operator-=(const linkString& other); // Compound assignment operator overload (removal) charoperator[](int index) const; // Subscript operator overload voidinsert(int start, const linkString& other, int insertsize = -1); // Insert operation voidremove(int deletelength, int start = -1); // Remove operation intlength()const; // Get the length of the string boolisEmpty()const; // Check if the string is empty linkString substr(int start, int leng)const; // Get a substring };
研究表明,块状链表的节点容量和节点个数相同的时候,算法效率最高。
因此,节点容量一个设置为:sqrt(length) ,来尽可能提高算法的效率。
附上数学推导:
块状链表的查找操作需要遍历块(链表部分)和在块内查找(数组部分)。
如果节点容量过大,块内查找的时间会增加(因为数组部分变长)。
如果节点容量过小,块的数量会增加,导致链表部分变长,遍历块的代价增加。
当节点容量和节点个数相同时,查找和修改的代价达到平衡,整体效率最高。
假设块状链表有 ( n ) 个元素,块的大小为 ( B ),块的数量为 ( M ),则 n = BM。
// copy the new linkstring p = head = newnode(1); length = other.length; nodesize = other.nodesize; while (otherp != nullptr) { p = p->next = newnode(nodesize); for (; (p->size) < (otherp->size); ++(p->size)) { // if p->size == other->size ,then all the valuable data has been copied successfully. p->data[p->size] = otherp->data[p->size]; } otherp = otherp->next; // traverse the other link string } }
/** * @brief clear all the memory of the linked list (private func) */ voidlinkString::clear() { node *p = head->next, *nextp; // nextp store next pointer temporarily while (p != nullptr) { nextp = p->next; delete p; p = nextp; } }
/** * @brief get the length of the string (public func) * @return the length of string */ intlinkString::getlength()const { return length; }
/** * @brief judge whether the string is empty */ boollinkString::isEmpty()const { return length == 0; }
/** * @brief overload of assignment operator * * @param other the rvalue (const reference) of the assignment operator * @return the reference of the assigned lvalue */ linkString &linkString::operator=(const linkString &other) { if (&other == this) return *this;
// Clear the current list clear();
// Copy the length and nodesize length = other.length; nodesize = other.nodesize;
while (otherp != nullptr) { p->next = newnode(nodesize); p = p->next;
// Copy the data for (int i = 0; i < otherp->size; ++i) { p->data[i] = otherp->data[i]; } p->size = otherp->size;
// Move to the next node in the other list otherp = otherp->next; }
return *this; }
/** * @brief find the starting position of a given position * * @param start the given position (or index) * @param pos find the position in the node (reference) * @param p find the node where the start lies in */ voidlinkString::findPos(int start, int &pos, node *&p)const { int count = 0; // define a counter p = head->next; // the first node(pointer)
while (count < start) { if (count + p->size < start) { // the start isn't in this node! count += p->size; p = p->next; } else { // the start is in this node! pos = start - count; return; } } }
/** * @brief get a substring * * @param start the start position * @param sublength the length of the substring * @throws std::out_of_range If pos is out of range. */ linkString linkString::substr(int start, int sublength)const { linkString tmp; // storing result int count = 0; int pos = 0; node *p, *tp = tmp.head;
if (start < 0 || start >= length) { throw std::out_of_range("Invalid starting position!"); } else { // if the length left is smaller than the given sublength: if (start + sublength >= length) { sublength = length - start; }
// use the findPos function to find p and pos findPos(start, pos, p);
// after getting the starting position, we can copy the substring! for (int index = 0; index < tmp.length;) { /* * two for-loop layers, index represents the substring index * inner loop is used to copy every block, while the outer loop is used to update the block. */ tp = tp->next = newnode(tmp.nodesize); while (tp->size <= tmp.nodesize && index < tmp.length) { if (pos == p->size) { /* * the traverse has ended in the block. * p needs to be updated to the next block. */ p = p->next; pos = 0; } tp->data[tp->size] = p->data[pos];
/** * @brief Split operations for a block (one to two) (private func). * * @param p The node to be split. * @param pos The specific index where the split happens (the pos position is moved to the second node). * @throws std::out_of_range If pos is out of range. */ voidlinkString::split(node *p, int pos) { // Check if p is nullptr if (p == nullptr) { throw std::invalid_argument("The node to be split is null."); }
// Check if pos is within the valid range if (pos < 0 || pos > p->size) { throw std::out_of_range("The split position is out of range."); }
// Create a new node and insert it after p node *nextp = newnode(nodesize, p->next); p->next = nextp;
// Move data from p to nextp for (int i = pos; i < p->size; i++) { nextp->data[i - pos] = p->data[i]; }
// Update sizes of both nodes nextp->size = p->size - pos; // Size of the new node p->size = pos; // Size of the original node after split }
/** * @brief merge two nodes into one * * @param p one of the nodes that need to be merged */ voidlinkString::merge(node *p) { if (p == nullptr || p->next == nullptr) return; node *nextp = p->next; if (p->size + nextp->size <= nodesize) { // merge available for (int pos = 0; pos < nextp->size; ++pos, ++p->size) { p->data[p->size] = nextp->data[pos]; } p->next = nextp->next; delete nextp; } }
/** * @brief insert a string into linked string, using the findPos, split and merge functions * * @param start the starting position (using findPos) * @param other the string that needs to be inserted * @param insertsize the length of the string needs to be inserted, default value for all string */ voidlinkString::insert(int start, const linkString &other, int insertsize) { if (insertsize == -1) { insertsize = other.length; }
if (start < 0 || start > length) { throw std::out_of_range("The insert position is out of range"); }
/* * pos and p is calculated by func findPos, representing the insert position * nextp is right after p, which is used for the split func * after the insert, the merge function will be used */ node *p, *nextp, *tmp; int pos = 0; findPos(start, pos, p); split(p, pos);
nextp = p->next; // nextp is used for stroage in case the link string is broken linkString tobeinserted = other.substr(0, insertsize); tmp = tobeinserted.head->next; while (tmp != nullptr) { // operate for every node for (pos = 0; pos < tmp->size; ++pos) { if (p->size == nodesize) { // need for expansion p = p->next = newnode(nodesize); } p->data[p->size] = tmp->data[pos]; ++p->size; } tmp = tmp->next; }
length += insertsize; p->next = nextp; merge(p); // see whether the merge is available }
/** * @brief Remove the specific part of a linked string * * @param deletelength the length to be deleted * @param start the starting position that needs tobe removed (the start itself is included), default for remove from the back */ voidlinkString::remove(int deletelength, int start) { if (start == -1) { start = length - deletelength; }
// find the position to be removed node *startp; // represent the starting position to be deleted int pos = 0; findPos(start, pos, startp);
split(startp, pos); // split the node
if (start + deletelength >= length) { deletelength = length - start; length = start; // if the deletelength goes over the edge } else { length -= deletelength; }
while (true) { node *nextp = startp->next; if (deletelength > nextp->size) { // the end node is not here! Then delete this node! deletelength -= nextp->size; startp->next = nextp->next; delete nextp; } else { // the end node is here! split(nextp, deletelength); startp->next = nextp->next; // now the nextp is all the data needs tobe deleted, while nextp->next stores the data remaining delete nextp; break; // jump out of the loop after the remove operation is done! } } merge(startp); // merge all the startp; }
/** * @brief Overloads the [] operator to access a character at a specific index in the linkString. * * @param index The position of the character to access (0-based index). * @return The character at the specified index. * @throws std::out_of_range If the index is invalid (less than 0 or greater than or equal to the string length). */ char linkString::operator[](int index) const { if (index < 0 || index >= length) { throw std::out_of_range("Index " + std::to_string(index) + " is out of range. Valid range is [0, " + std::to_string(length - 1) + "]."); }
node *current_node; // Pointer to the node containing the target character int node_position = 0; // Position within the node findPos(index, node_position, current_node);
return current_node->data[node_position]; }
/** * @brief the overload function of += operator * * @param other the linkString needs to be appended * @return the reference of the appended string */ linkString &linkString::operator+=(const linkString &other) { this->insert(length, other); return *this; }
/** * @brief the overload function of -= operator * * @param other the linkString needs to be poped (removed) * @return the reference of the string */ linkString &linkString::operator-=(const linkString &other) { linkString tmp = substr(length - other.length, other.length); if (tmp == other) { this->remove(length - other.length, other.length); return *this; } else { throw std::invalid_argument("The string cannot be removed!"); } }
/** * @brief the overload function of + operator * * @param s1 the left plus string * @param s2 the right plus string * * @return a tmp value of the added string */ linkString operator+(const linkString &s1, const linkString &s2) { linkString ans = s1; ans += s2; return std::move(ans); }
/** * @brief judge whether the two strings is equal * * @param s1 the left string needs to be judged * @param s2 the right string needs to be judged * * @return a bool value */ booloperator==(const linkString &s1, const linkString &s2) { if (s1.length != s2.length) returnfalse; // uses two pointers to traverse the linkString
linkString::node *p1 = s1.head->next; linkString::node *p2 = s2.head->next; int current_pos_for_s1 = 0; int current_pos_for_s2 = 0;
while (p1 && p2) { // the end statement:one of the node* reacheds to the end if (p1->data[current_pos_for_s1] != p2->data[current_pos_for_s2]) { returnfalse; }
// update the index current_pos_for_s1++; current_pos_for_s2++;
// update the node if (current_pos_for_s1 == p1->size) { p1 = p1->next; current_pos_for_s1 = 0; }
/** * @brief judge whether the two strings are not equal * * All the parameters are equal to the == function */ booloperator!=(const linkString &s1, const linkString &s2) { return !(s1 == s2); }
// several functions for string comparison
/** * @brief the sorted function for string comparison, if s1 > s2, then the func returns true. */ booloperator>(const linkString &s1, const linkString &s2) { // pointers for traverse linkString::node *p1 = s1.head->next; linkString::node *p2 = s2.head->next; int current_pos_for_s1 = 0; int current_pos_for_s2 = 0;
while (p1 != nullptr) { if (p1->data[current_pos_for_s1] < p2->data[current_pos_for_s2]) { returnfalse; }
if (p1->data[current_pos_for_s1] > p2->data[current_pos_for_s2]) { returntrue; }
// update the index current_pos_for_s1++; current_pos_for_s2++;
// update the node if (current_pos_for_s1 == p1->size) { p1 = p1->next; current_pos_for_s1 = 0; }
if (current_pos_for_s2 == p2->size) { p2 = p2->next; current_pos_for_s2 = 0; } } // whatever p2 is (nullptr or not), there is no possilbility s1>s2! returnfalse; }
/** * @brief the overload function of << operator(for output) * * @param os reference to the istream classes * @param s the string that needs to be printed. * * @return the l-value for os */ std::ostream &operator<<(std::ostream &os, const linkString &s) { // for traverse linkString::node *p = s.head->next;
while (p != nullptr) { for (int index = 0; index < p->size; index++) { os << p->data[index]; // because the string doesnot have '\0', we cannot << it at a time. } p = p->next; }
return os; }
/** * @brief the overload function of >> operator(for input) * * @param is reference to the istream classes * @param s the string that needs to be printed. * * @return the l-value for is */ std::istream &operator>>(std::istream &is, linkString &s) { constint maxinputsize = 1024; char *newstring = newchar[maxinputsize]; int currrentlength = 0; newstring[0] = '\0';
In computer science, the Knuth–Morris–Pratt algorithm (or KMP algorithm) is a string-searching algorithm that searches for occurrences of a “word” W within a main “text string” S by employing the observation that when a mismatch occurs, the word itself embodies sufficient information to determine where the next match could begin, thus bypassing re-examination of previously matched characters. [1][2]
KMP算法中的K是Knuth老爷子!也就是图灵奖得主,《The Art of Computer Programming》巨著的作者!
voidoutermatch(seqString target, seqString totalString, int (*string_match_algorithm[])(seqString, seqString), int func_choice) { int index = string_match_algorithm[func_choice](target, totalString); std::cout << "Using " << func_choice << "th algorithm" << std::endl; if (index == -1) { std::cout << "Unfortunately, the match fails." << std::endl; std::cout << "The string " << totalString << " does not have a substring " << target << std::endl; } else { std::cout << "Got it!" << std::endl; std::cout << "There exits a target string with the index of " << index << " and with the length of " << target.length() << std::endl; std::cout << "Which means there exists a substr: " << totalString.substr(index, target.length()) << " ,which matches with target: " << target << std::endl; } }
/** * @brief The most complex implementation of string-matching, using enumeration * * @param target The target string needs to be matched * @param total_string The whole string that needs to be detected * * @return the index for the first match. return -1 if no match occurs. */ intcomplexStringMatch(seqString target, seqString totalString) { int target_length = target.length(); int total_length = totalString.length();
if (total_length < target_length) { returnfalse; }
for (int i = 0; i + target_length <= total_length; ++i) { seqString tmp = totalString.substr(i, target_length);
if (tmp == target) { // match successfully return i; } } return-1; }
// KMP algorithm implementation intKMP(seqString pattern, seqString text){ if (pattern.isEmpty()) return0; // Empty pattern matches at the start
int textLen = text.length(); int patLen = pattern.length(); std::vector<int> lps(patLen, 0); computeLPS(pattern, lps);
int i = 0; // Index for text int j = 0; // Index for pattern
while (i < textLen) { if (pattern[j] == text[i]) { i++; j++; }
if (j == patLen) { return i - j; // Match found, return the starting index } elseif (i < textLen && pattern[j] != text[i]) { if (j != 0) { j = lps[j - 1]; // Fallback using the LPS array } else { i++; // No fallback possible, move to the next character in text } } }
voidcomputeLPS(const seqString& pattern, std::vector<int>& lps){ int len = 0; // Length of the previous longest prefix suffix lps[0] = 0; // lps[0] is always 0 int i = 1;
while (i < pattern.length()) { if (pattern[i] == pattern[len]) { len++; lps[i] = len; i++; } else { if (len != 0) { len = lps[len - 1]; // Fallback to the previous longest prefix suffix //特别注意:此处并未执行i++,因为这个位置的值还没有确定,需要再更新 } else { lps[i] = 0; // No matching prefix suffix i++; } } } }
/* *The following code is for string-match problems *We use our handmade string for sequence implementation */ // declaration voidoutermatch(seqString target, seqString totalString, int (*string_match_algorithm[])(seqString, seqString), int func_choice); intcomplexStringMatch(seqString target, seqString totalString); intKMP(seqString target, seqString totalString); voidcomputeLPS(const seqString &pattern, std::vector<int> &next);
/** * @brief The outer message output of string-matching problems * * @param target The target string needs to be matched * @param total_string The whole string that needs to be detected * @param string_match_algorithm The specific string-match problems * @param func_choice Choose one algorithm */ voidoutermatch(seqString target, seqString totalString, int (*string_match_algorithm[])(seqString, seqString), int func_choice) { int index = string_match_algorithm[func_choice](target, totalString); std::cout << "Using " << func_choice << "th algorithm" << std::endl; if (index == -1) { std::cout << "Unfortunately, the match fails." << std::endl; std::cout << "The string " << totalString << " does not have a substring " << target << std::endl; } else { std::cout << "Got it!" << std::endl; std::cout << "There exits a target string with the index of " << index << " and with the length of " << target.length() << std::endl; std::cout << "Which means there exists a substr: " << totalString.substr(index, target.length()) << " ,which matches with target: " << target << std::endl; } }
/** * !KMP algorithm * @brief the Knuth-Morris-Pratt (KMP) algorithm * * @param text The main text string to search within * @param pattern The pattern string to search for * @return The first occurrence index of the pattern in the text, or -1 if not found */ voidcomputeLPS(const seqString& pattern, std::vector<int>& lps){ int len = 0; // Length of the previous longest prefix suffix lps[0] = 0; // lps[0] is always 0 int i = 1;
while (i < pattern.length()) { if (pattern[i] == pattern[len]) { len++; lps[i] = len; i++; } else { if (len != 0) { len = lps[len - 1]; // Fallback to the previous longest prefix suffix } else { lps[i] = 0; // No matching prefix suffix i++; } } } }
// KMP algorithm implementation intKMP(seqString pattern, seqString text){ if (pattern.isEmpty()) return0; // Empty pattern matches at the start
int textLen = text.length(); int patLen = pattern.length(); std::vector<int> lps(patLen, 0); computeLPS(pattern, lps);
int i = 0; // Index for text int j = 0; // Index for pattern
while (i < textLen) { if (pattern[j] == text[i]) { i++; j++; }
if (j == patLen) { return i - j; // Match found, return the starting index } elseif (i < textLen && pattern[j] != text[i]) { if (j != 0) { j -= lps[j - 1]; // Fallback using the LPS array } else { i++; // No fallback possible, move to the next character in text } } } return-1; // No match found }
voidtestPackageForKMP() { seqString s1("Hello my name is YXY and I love learning DS"); seqString targets[] = {"Hello", "my", "MM"}; int (*string_match_algorithm[])(seqString, seqString) = {complexStringMatch, KMP};
for (int i = 0; i < 2; i++) { for (auto target : targets) { outermatch(target, s1, string_match_algorithm, i); } } }