You've already forked crosspoint-reader
mirror of
https://github.com/crosspoint-reader/crosspoint-reader.git
synced 2026-04-29 10:26:52 -07:00
302dea1eea
Switch KOReader sync progress mapping from chapter matching to XPath-based mapping. - resolves KOReader positions using real XHTML ancestry paths - supports paragraph-based upload mapping with text offsets where needed - passes the current paragraph index into sync so uploads map back to KOReader more accurately No HTTP client changes are included. No reader-state or resume-flow changes are included. --------- Co-authored-by: jpirnay <jens@pirnay.com>
564 lines
15 KiB
C++
564 lines
15 KiB
C++
#include "ChapterXPathResolver.h"
|
|
|
|
#include <Logging.h>
|
|
#include <Print.h>
|
|
#include <Utf8.h>
|
|
#include <XmlParserUtils.h>
|
|
#include <expat.h>
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <cstring>
|
|
#include <string>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
namespace {
|
|
std::string stripPrefix(const XML_Char* name) {
|
|
if (!name) {
|
|
return "";
|
|
}
|
|
|
|
const char* local = std::strrchr(name, ':');
|
|
return local ? std::string(local + 1) : std::string(name);
|
|
}
|
|
|
|
struct NameCounter {
|
|
std::string name;
|
|
int count;
|
|
};
|
|
|
|
struct ParentState {
|
|
std::vector<NameCounter> children;
|
|
|
|
int nextIndex(const std::string& name) {
|
|
for (auto& child : children) {
|
|
if (child.name == name) {
|
|
child.count++;
|
|
return child.count;
|
|
}
|
|
}
|
|
|
|
children.push_back({name, 1});
|
|
return 1;
|
|
}
|
|
};
|
|
|
|
struct PathSegment {
|
|
std::string name;
|
|
int index;
|
|
};
|
|
|
|
std::string buildParagraphXPath(const int spineIndex, const std::vector<PathSegment>& path, const int charOffset) {
|
|
std::string xpath = "/body/DocFragment[" + std::to_string(spineIndex + 1) + "]/body";
|
|
for (const auto& segment : path) {
|
|
xpath += "/" + segment.name + "[" + std::to_string(segment.index) + "]";
|
|
}
|
|
if (charOffset > 0) {
|
|
xpath += "/text()." + std::to_string(charOffset);
|
|
}
|
|
return xpath;
|
|
}
|
|
|
|
size_t countUtf8Codepoints(const XML_Char* data, const int len) {
|
|
if (!data || len <= 0) {
|
|
return 0;
|
|
}
|
|
|
|
size_t count = 0;
|
|
const unsigned char* ptr = reinterpret_cast<const unsigned char*>(data);
|
|
const unsigned char* end = ptr + len;
|
|
while (ptr < end) {
|
|
utf8NextCodepoint(&ptr);
|
|
count++;
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
class ParagraphTextCounter final : public Print {
|
|
public:
|
|
ParagraphTextCounter() {
|
|
parser = XML_ParserCreate(nullptr);
|
|
if (!parser) {
|
|
LOG_ERR("KOX", "Failed to create XML parser");
|
|
return;
|
|
}
|
|
|
|
XML_SetUserData(parser, this);
|
|
XML_SetElementHandler(parser, &ParagraphTextCounter::startElement, &ParagraphTextCounter::endElement);
|
|
XML_SetCharacterDataHandler(parser, &ParagraphTextCounter::characterData);
|
|
}
|
|
|
|
~ParagraphTextCounter() override { destroyXmlParser(parser); }
|
|
|
|
bool ok() const { return parser != nullptr && parseOk; }
|
|
|
|
bool finish() {
|
|
if (!parser || !parseOk || stopped) {
|
|
return parseOk;
|
|
}
|
|
|
|
if (XML_Parse(parser, "", 0, XML_TRUE) == XML_STATUS_ERROR) {
|
|
LOG_ERR("KOX", "Final XML parse error: %s", XML_ErrorString(XML_GetErrorCode(parser)));
|
|
parseOk = false;
|
|
}
|
|
return parseOk;
|
|
}
|
|
|
|
size_t write(uint8_t c) override { return write(&c, 1); }
|
|
|
|
size_t write(const uint8_t* buffer, size_t size) override {
|
|
if (!parser || !parseOk || stopped) {
|
|
return size;
|
|
}
|
|
|
|
if (XML_Parse(parser, reinterpret_cast<const char*>(buffer), static_cast<int>(size), XML_FALSE) != XML_STATUS_OK) {
|
|
const enum XML_Error error = XML_GetErrorCode(parser);
|
|
if (error != XML_ERROR_ABORTED) {
|
|
LOG_ERR("KOX", "XML parse error: %s", XML_ErrorString(error));
|
|
parseOk = false;
|
|
}
|
|
}
|
|
|
|
return size;
|
|
}
|
|
|
|
size_t totalVisibleChars() const { return visibleChars; }
|
|
|
|
private:
|
|
static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char**) {
|
|
auto* self = static_cast<ParagraphTextCounter*>(userData);
|
|
self->onStartElement(name);
|
|
}
|
|
|
|
static void XMLCALL endElement(void* userData, const XML_Char* name) {
|
|
auto* self = static_cast<ParagraphTextCounter*>(userData);
|
|
self->onEndElement(name);
|
|
}
|
|
|
|
static void XMLCALL characterData(void* userData, const XML_Char* data, const int len) {
|
|
auto* self = static_cast<ParagraphTextCounter*>(userData);
|
|
self->onCharacterData(data, len);
|
|
}
|
|
|
|
void onStartElement(const XML_Char* rawName) {
|
|
const std::string name = stripPrefix(rawName);
|
|
|
|
if (!insideBody) {
|
|
if (name == "body") {
|
|
insideBody = true;
|
|
bodyDepth = depth;
|
|
}
|
|
depth++;
|
|
return;
|
|
}
|
|
|
|
if (name == "p") {
|
|
paragraphDepth++;
|
|
}
|
|
depth++;
|
|
}
|
|
|
|
void onEndElement(const XML_Char* rawName) {
|
|
const std::string name = stripPrefix(rawName);
|
|
|
|
depth--;
|
|
if (!insideBody) {
|
|
return;
|
|
}
|
|
|
|
if (depth == bodyDepth && name == "body") {
|
|
insideBody = false;
|
|
return;
|
|
}
|
|
|
|
if (name == "p" && paragraphDepth > 0) {
|
|
paragraphDepth--;
|
|
}
|
|
}
|
|
|
|
void onCharacterData(const XML_Char* data, const int len) {
|
|
if (!insideBody || paragraphDepth <= 0 || len <= 0) {
|
|
return;
|
|
}
|
|
|
|
visibleChars += countUtf8Codepoints(data, len);
|
|
}
|
|
|
|
private:
|
|
XML_Parser parser = nullptr;
|
|
bool parseOk = true;
|
|
bool insideBody = false;
|
|
bool stopped = false;
|
|
int depth = 0;
|
|
int bodyDepth = -1;
|
|
int paragraphDepth = 0;
|
|
size_t visibleChars = 0;
|
|
};
|
|
|
|
class XPathParagraphResolver final : public Print {
|
|
public:
|
|
explicit XPathParagraphResolver(const int targetParagraph) : targetParagraph(targetParagraph) {
|
|
parser = XML_ParserCreate(nullptr);
|
|
if (!parser) {
|
|
LOG_ERR("KOX", "Failed to create XML parser");
|
|
return;
|
|
}
|
|
|
|
XML_SetUserData(parser, this);
|
|
XML_SetElementHandler(parser, &XPathParagraphResolver::startElement, &XPathParagraphResolver::endElement);
|
|
}
|
|
|
|
~XPathParagraphResolver() override { destroyXmlParser(parser); }
|
|
|
|
bool ok() const { return parser != nullptr && parseOk; }
|
|
|
|
bool finish() {
|
|
if (!parser || !parseOk || stopped) {
|
|
return parseOk;
|
|
}
|
|
|
|
if (XML_Parse(parser, "", 0, XML_TRUE) == XML_STATUS_ERROR) {
|
|
LOG_ERR("KOX", "Final XML parse error: %s", XML_ErrorString(XML_GetErrorCode(parser)));
|
|
parseOk = false;
|
|
}
|
|
return parseOk;
|
|
}
|
|
|
|
bool hasMatch() const { return !xpath.empty(); }
|
|
const std::string& getXPath() const { return xpath; }
|
|
|
|
size_t write(uint8_t c) override { return write(&c, 1); }
|
|
|
|
size_t write(const uint8_t* buffer, size_t size) override {
|
|
if (!parser || !parseOk || stopped) {
|
|
return size;
|
|
}
|
|
|
|
if (XML_Parse(parser, reinterpret_cast<const char*>(buffer), static_cast<int>(size), XML_FALSE) != XML_STATUS_OK) {
|
|
const enum XML_Error error = XML_GetErrorCode(parser);
|
|
if (error != XML_ERROR_ABORTED) {
|
|
LOG_ERR("KOX", "XML parse error: %s", XML_ErrorString(error));
|
|
parseOk = false;
|
|
}
|
|
}
|
|
|
|
return size;
|
|
}
|
|
|
|
int spineIndex = 0;
|
|
|
|
private:
|
|
static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char**) {
|
|
auto* self = static_cast<XPathParagraphResolver*>(userData);
|
|
self->onStartElement(name);
|
|
}
|
|
|
|
static void XMLCALL endElement(void* userData, const XML_Char* name) {
|
|
auto* self = static_cast<XPathParagraphResolver*>(userData);
|
|
self->onEndElement(name);
|
|
}
|
|
|
|
void onStartElement(const XML_Char* rawName) {
|
|
const std::string name = stripPrefix(rawName);
|
|
|
|
if (!insideBody) {
|
|
if (name == "body") {
|
|
insideBody = true;
|
|
bodyDepth = depth;
|
|
parentStates.emplace_back();
|
|
}
|
|
depth++;
|
|
return;
|
|
}
|
|
|
|
const int siblingIndex = parentStates.back().nextIndex(name);
|
|
path.push_back({name, siblingIndex});
|
|
parentStates.emplace_back();
|
|
|
|
if (name == "p") {
|
|
paragraphCount++;
|
|
if (paragraphCount == targetParagraph) {
|
|
xpath = buildParagraphXPath(spineIndex, path, 0);
|
|
stopped = true;
|
|
XML_StopParser(parser, XML_FALSE);
|
|
}
|
|
}
|
|
|
|
depth++;
|
|
}
|
|
|
|
void onEndElement(const XML_Char* rawName) {
|
|
const std::string name = stripPrefix(rawName);
|
|
|
|
depth--;
|
|
if (!insideBody) {
|
|
return;
|
|
}
|
|
|
|
if (depth == bodyDepth && name == "body") {
|
|
insideBody = false;
|
|
parentStates.clear();
|
|
path.clear();
|
|
return;
|
|
}
|
|
|
|
if (!path.empty()) {
|
|
path.pop_back();
|
|
}
|
|
if (!parentStates.empty()) {
|
|
parentStates.pop_back();
|
|
}
|
|
}
|
|
|
|
XML_Parser parser = nullptr;
|
|
const int targetParagraph;
|
|
bool parseOk = true;
|
|
bool insideBody = false;
|
|
bool stopped = false;
|
|
int depth = 0;
|
|
int bodyDepth = -1;
|
|
int paragraphCount = 0;
|
|
std::vector<ParentState> parentStates;
|
|
std::vector<PathSegment> path;
|
|
std::string xpath;
|
|
};
|
|
|
|
class XPathProgressResolver final : public Print {
|
|
public:
|
|
explicit XPathProgressResolver(const size_t targetVisibleChar) : targetVisibleChar(targetVisibleChar) {
|
|
parser = XML_ParserCreate(nullptr);
|
|
if (!parser) {
|
|
LOG_ERR("KOX", "Failed to create XML parser");
|
|
return;
|
|
}
|
|
|
|
XML_SetUserData(parser, this);
|
|
XML_SetElementHandler(parser, &XPathProgressResolver::startElement, &XPathProgressResolver::endElement);
|
|
XML_SetCharacterDataHandler(parser, &XPathProgressResolver::characterData);
|
|
}
|
|
|
|
~XPathProgressResolver() override { destroyXmlParser(parser); }
|
|
|
|
bool ok() const { return parser != nullptr && parseOk; }
|
|
|
|
bool finish() {
|
|
if (!parser || !parseOk || stopped) {
|
|
return parseOk;
|
|
}
|
|
|
|
if (XML_Parse(parser, "", 0, XML_TRUE) == XML_STATUS_ERROR) {
|
|
LOG_ERR("KOX", "Final XML parse error: %s", XML_ErrorString(XML_GetErrorCode(parser)));
|
|
parseOk = false;
|
|
}
|
|
return parseOk;
|
|
}
|
|
|
|
bool hasMatch() const { return !xpath.empty(); }
|
|
const std::string& getXPath() const { return xpath; }
|
|
|
|
size_t write(uint8_t c) override { return write(&c, 1); }
|
|
|
|
size_t write(const uint8_t* buffer, size_t size) override {
|
|
if (!parser || !parseOk || stopped) {
|
|
return size;
|
|
}
|
|
|
|
if (XML_Parse(parser, reinterpret_cast<const char*>(buffer), static_cast<int>(size), XML_FALSE) != XML_STATUS_OK) {
|
|
const enum XML_Error error = XML_GetErrorCode(parser);
|
|
if (error != XML_ERROR_ABORTED) {
|
|
LOG_ERR("KOX", "XML parse error: %s", XML_ErrorString(error));
|
|
parseOk = false;
|
|
}
|
|
}
|
|
|
|
return size;
|
|
}
|
|
|
|
int spineIndex = 0;
|
|
|
|
private:
|
|
static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char**) {
|
|
auto* self = static_cast<XPathProgressResolver*>(userData);
|
|
self->onStartElement(name);
|
|
}
|
|
|
|
static void XMLCALL endElement(void* userData, const XML_Char* name) {
|
|
auto* self = static_cast<XPathProgressResolver*>(userData);
|
|
self->onEndElement(name);
|
|
}
|
|
|
|
static void XMLCALL characterData(void* userData, const XML_Char* data, const int len) {
|
|
auto* self = static_cast<XPathProgressResolver*>(userData);
|
|
self->onCharacterData(data, len);
|
|
}
|
|
|
|
void onStartElement(const XML_Char* rawName) {
|
|
const std::string name = stripPrefix(rawName);
|
|
|
|
if (!insideBody) {
|
|
if (name == "body") {
|
|
insideBody = true;
|
|
bodyDepth = depth;
|
|
parentStates.emplace_back();
|
|
}
|
|
depth++;
|
|
return;
|
|
}
|
|
|
|
const int siblingIndex = parentStates.back().nextIndex(name);
|
|
path.push_back({name, siblingIndex});
|
|
parentStates.emplace_back();
|
|
|
|
if (name == "p") {
|
|
paragraphDepth++;
|
|
paragraphVisibleChars = 0;
|
|
}
|
|
|
|
depth++;
|
|
}
|
|
|
|
void onEndElement(const XML_Char* rawName) {
|
|
const std::string name = stripPrefix(rawName);
|
|
|
|
depth--;
|
|
if (!insideBody) {
|
|
return;
|
|
}
|
|
|
|
if (depth == bodyDepth && name == "body") {
|
|
insideBody = false;
|
|
parentStates.clear();
|
|
path.clear();
|
|
return;
|
|
}
|
|
|
|
if (name == "p" && paragraphDepth > 0) {
|
|
paragraphDepth--;
|
|
paragraphVisibleChars = 0;
|
|
}
|
|
|
|
if (!path.empty()) {
|
|
path.pop_back();
|
|
}
|
|
if (!parentStates.empty()) {
|
|
parentStates.pop_back();
|
|
}
|
|
}
|
|
|
|
void onCharacterData(const XML_Char* data, const int len) {
|
|
if (!insideBody || paragraphDepth <= 0 || len <= 0 || stopped) {
|
|
return;
|
|
}
|
|
|
|
const size_t codepointCount = countUtf8Codepoints(data, len);
|
|
const size_t nextVisibleChars = visibleChars + codepointCount;
|
|
if (targetVisibleChar <= nextVisibleChars) {
|
|
const size_t delta = targetVisibleChar - visibleChars;
|
|
const int charOffset = static_cast<int>(paragraphVisibleChars + delta);
|
|
xpath = buildParagraphXPath(spineIndex, path, std::max(1, charOffset));
|
|
stopped = true;
|
|
XML_StopParser(parser, XML_FALSE);
|
|
return;
|
|
}
|
|
|
|
visibleChars = nextVisibleChars;
|
|
paragraphVisibleChars += codepointCount;
|
|
}
|
|
|
|
XML_Parser parser = nullptr;
|
|
const size_t targetVisibleChar;
|
|
bool parseOk = true;
|
|
bool insideBody = false;
|
|
bool stopped = false;
|
|
int depth = 0;
|
|
int bodyDepth = -1;
|
|
int paragraphDepth = 0;
|
|
size_t visibleChars = 0;
|
|
size_t paragraphVisibleChars = 0;
|
|
std::vector<ParentState> parentStates;
|
|
std::vector<PathSegment> path;
|
|
std::string xpath;
|
|
};
|
|
} // namespace
|
|
|
|
std::string ChapterXPathResolver::findXPathForParagraph(const std::shared_ptr<Epub>& epub, const int spineIndex,
|
|
const uint16_t paragraphIndex) {
|
|
if (!epub || paragraphIndex == 0 || spineIndex < 0 || spineIndex >= epub->getSpineItemsCount()) {
|
|
return "";
|
|
}
|
|
|
|
const auto href = epub->getSpineItem(spineIndex).href;
|
|
if (href.empty()) {
|
|
return "";
|
|
}
|
|
|
|
XPathParagraphResolver resolver(paragraphIndex);
|
|
if (!resolver.ok()) {
|
|
return "";
|
|
}
|
|
|
|
resolver.spineIndex = spineIndex;
|
|
if (!epub->readItemContentsToStream(href, resolver, 1024) || !resolver.finish()) {
|
|
return "";
|
|
}
|
|
|
|
if (resolver.hasMatch()) {
|
|
LOG_DBG("KOX", "Resolved paragraph %u in spine %d -> %s", paragraphIndex, spineIndex, resolver.getXPath().c_str());
|
|
return resolver.getXPath();
|
|
}
|
|
|
|
LOG_DBG("KOX", "Paragraph %u not found in spine %d", paragraphIndex, spineIndex);
|
|
return "";
|
|
}
|
|
|
|
std::string ChapterXPathResolver::findXPathForProgress(const std::shared_ptr<Epub>& epub, const int spineIndex,
|
|
const float intraSpineProgress) {
|
|
if (!epub || spineIndex < 0 || spineIndex >= epub->getSpineItemsCount()) {
|
|
return "";
|
|
}
|
|
|
|
const auto href = epub->getSpineItem(spineIndex).href;
|
|
if (href.empty()) {
|
|
return "";
|
|
}
|
|
|
|
if (!(intraSpineProgress > 0.0f)) {
|
|
return "/body/DocFragment[" + std::to_string(spineIndex + 1) + "]/body";
|
|
}
|
|
|
|
ParagraphTextCounter counter;
|
|
if (!counter.ok() || !epub->readItemContentsToStream(href, counter, 1024) || !counter.finish()) {
|
|
return "";
|
|
}
|
|
|
|
const size_t totalVisibleChars = counter.totalVisibleChars();
|
|
if (totalVisibleChars == 0) {
|
|
return "";
|
|
}
|
|
|
|
const float clamped = std::max(0.0f, std::min(1.0f, intraSpineProgress));
|
|
const size_t targetVisibleChar =
|
|
std::max<size_t>(1, std::min(totalVisibleChars, static_cast<size_t>(std::ceil(clamped * totalVisibleChars))));
|
|
|
|
XPathProgressResolver resolver(targetVisibleChar);
|
|
if (!resolver.ok()) {
|
|
return "";
|
|
}
|
|
|
|
resolver.spineIndex = spineIndex;
|
|
if (!epub->readItemContentsToStream(href, resolver, 1024) || !resolver.finish()) {
|
|
return "";
|
|
}
|
|
|
|
if (resolver.hasMatch()) {
|
|
LOG_DBG("KOX", "Resolved progress %.3f in spine %d -> %s", intraSpineProgress, spineIndex,
|
|
resolver.getXPath().c_str());
|
|
return resolver.getXPath();
|
|
}
|
|
|
|
LOG_DBG("KOX", "Could not resolve progress %.3f in spine %d", intraSpineProgress, spineIndex);
|
|
return "";
|
|
}
|