After rigorous searching I found Andrei Ionescu's post on this. I liked his method, but while reading through the comments I saw someone mention regular expressions, and I thought to myself: why not use regular expressions to preprocess the HTML instead of parsing it character by character?
Eventually I came up with this method, and I am posting it here for anyone interested. Feel free to use it.
/**
 * Returns the index inside htmlStr that corresponds to the character at
 * textIndex in the plain text represented by that HTML.
 *
 * The HTML is split into tokens (tags, the five entities &lt; &gt; &amp;
 * &quot; &apos;, and runs of plain text). Tokens are consumed from the start
 * while two counters advance: one for the position in the HTML and one for
 * the position in the plain text.
 *
 * @param htmlStr   HTML markup to scan. A null value yields 0.
 * @param textIndex Zero-based index into the plain text.
 * @return The matching index into htmlStr. When textIndex falls exactly at
 *         the end of a text run, the index just past that run is returned
 *         (before any tags that follow it). When textIndex lies beyond the
 *         end of the text, the total length of the tokens consumed is
 *         returned instead of reading past the token list. A negative
 *         textIndex yields 0.
 */
private function calculateHtmlIndex(htmlStr:String, textIndex:int):int
{
	if (htmlStr == null)
		return 0;

	var htmlCounter:int = 0;
	var textCounter:int = 0;

	// Markup tokens that occupy exactly ONE character in the plain text.
	// Every other tag occupies none. The lookup is an exact, case-sensitive
	// match, so variants such as "<br>" or "<BR/>" count as zero characters.
	// NOTE(review): <BR> and </P> presumably stand for the line break they
	// render as - confirm against the text field that produced the HTML.
	var sizeOneEntities:String = "|&lt;|&gt;|&amp;|&quot;|&apos;|<BR>|</P>|";

	// First characters that mark a token as markup (a tag or an entity).
	var openTags:String = "<&";

	// Splits the HTML into tags, known entities and plain-text runs.
	// The last alternative can match the empty string, so the resulting
	// array may contain empty tokens.
	var tagPattern:RegExp = /<[^<>\s\/]*\s*[^<>]*[^P]?>|&(?:lt|gt|amp|quot|apos);|[^<>&]*/g;
	var tokensArray:Array = htmlStr.match(tagPattern);
	var tokenCount:int = tokensArray.length;

	var i:int = 0;
	var currentToken:String;

	// The bounds check on i stops the scan when textIndex is at or beyond
	// the end of the text; without it the next token read is null.
	while (textCounter <= textIndex && i < tokenCount)
	{
		currentToken = tokensArray[i++];

		// Empty regex matches contribute nothing to either counter.
		if (currentToken.length == 0)
			continue;

		if (openTags.indexOf(currentToken.charAt(0)) != -1)
		{
			// Tag or entity: always consumes HTML characters, and consumes
			// one text character only if it is listed in sizeOneEntities.
			htmlCounter += currentToken.length;
			if (sizeOneEntities.indexOf("|" + currentToken + "|") != -1)
				textCounter++;
		}
		else if ((textCounter + currentToken.length) >= textIndex)
		{
			// The requested text index lies inside (or at the end of) this
			// text run: advance by the remaining distance and stop.
			htmlCounter += textIndex - textCounter;
			break;
		}
		else
		{
			// Plain text located entirely before the requested index.
			htmlCounter += currentToken.length;
			textCounter += currentToken.length;
		}
	}

	return htmlCounter;
}
No comments:
Post a Comment