xxx.fdt主要存储了每个Field相应的内容, 格式如下:
void DocumentsWriter::ThreadState::writeDocument() {
// Excerpt: flushes the stored fields buffered for the current document.
// If we hit an exception while appending to the
// stored fields or term vectors files, we have to
// abort all documents since we last flushed because
// it means those files are possibly inconsistent.
try {
// Account for this document in the parent writer's doc-store counter.
_parent->numDocsInStore++;
// Append stored fields to the real FieldsWriter:
_parent->fieldsWriter->flushDocument(numStoredFields, fdtLocal); // numStoredFields is this document's stored-field count; fdtLocal holds its buffered field data
// Reset the local buffer (presumably so it can be reused for the next document).
fdtLocal->reset();
}
每一个字段的内容:
// Excerpt: only fields flagged as stored are serialized here.
if (field->isStored()) {
// Count the field; this counter is later handed to flushDocument().
threadState->numStoredFields++;
// Tracks whether writeField() finished without throwing.
bool success = false;
try {
// Serialize the field through the thread state's local FieldsWriter.
threadState->localFieldsWriter->writeField(fieldInfo, field);
success = true;
} _CLFINALLY(
字段函数:
void FieldsWriter::writeField(FieldInfo* fi, CL_NS(document)::Field* field)
{
// if the field as an instanceof FieldsReader.FieldForMerge, we're in merge mode
// and field.binaryValue() already returns the compressed value for a field
// with isCompressed()==true, so we disable compression in that case
bool disableCompression = (field->instanceOf(FieldsReader::FieldForMerge::getClassName()));
fieldsStream->writeVInt(fi->number);
uint8_t bits = 0;
if (field->isTokenized())
bits |= FieldsWriter::FIELD_IS_TOKENIZED;
if (field->isBinary())
bits |= FieldsWriter::FIELD_IS_BINARY;
if (field->isCompressed())
bits |= FieldsWriter::FIELD_IS_COMPRESSED;
fieldsStream->writeByte(bits);
if ( field->isCompressed() ){
// compression is enabled for the current field
CL_NS(util)::ValueArray<uint8_t> dataB;
const CL_NS(util)::ValueArray<uint8_t>* data = &dataB;
if (disableCompression) {
// optimized case for merging, the data
// is already compressed
data = field->binaryValue();
} else {
// check if it is a binary field
if (field->isBinary()) {
compress(*field->binaryValue(), dataB);
}else if ( field->stringValue() == NULL ){ //we must be using readerValue
CND_PRECONDITION(!field->isIndexed(), "Cannot store reader if it is indexed too")
Reader* r = field->readerValue();
int32_t sz = r->size();
if ( sz < 0 )
sz = 10000000; //todo: we should warn the developer here....
//read the entire string
const TCHAR* rv = NULL;
int64_t rl = r->read(rv, sz, 1);
if ( rl > LUCENE_INT32_MAX_SHOULDBE )
_CLTHROWA(CL_ERR_Runtime,"Field length too long");
else if ( rl < 0 )
rl = 0;
string str = lucene_wcstoutf8string(rv, rl);
CL_NS(util)::ValueArray<uint8_t> utfstr;
utfstr.length = str.length();
utfstr.values = (uint8_t*)str.c_str();
compress(utfstr, dataB);
utfstr.values = NULL;
}else if ( field->stringValue() != NULL ){
string str = lucene_wcstoutf8string(field->stringValue(), LUCENE_INT32_MAX_SHOULDBE);
CL_NS(util)::ValueArray<uint8_t> utfstr;
utfstr.length = str.length();
utfstr.values = (uint8_t*)str.c_str();
compress(utfstr, dataB);
utfstr.values = NULL;
}
}
fieldsStream->writeVInt(data->length);
fieldsStream->writeBytes(data->values, data->length);
}else{
//FEATURE: this problem in Java Lucene too, if using Reader, data is not stored.
//todo: this is a logic bug...
//if the field is stored, and indexed, and is using a reader the field wont get indexed
//
//if we could write zero prefixed vints (therefore static length), then we could
//write a reader directly to the field indexoutput and then go back and write the data
//length. however this is not supported in lucene yet...
//if this is ever implemented, then it would make sense to also be able to combine the
//FieldsWriter and DocumentWriter::invertDocument process, and use a streamfilter to
//write the field data while the documentwrite analyses the document! how cool would
//that be! it would cut out all these buffers!!!
// compression is disabled for the current field
if (field->isBinary()) {
const CL_NS(util)::ValueArray<uint8_t>* data = field->binaryValue();
fieldsStream->writeVInt(data->length);
fieldsStream->writeBytes(data->values, data->length);
}else if ( field->stringValue() == NULL ){ //we must be using readerValue
CND_PRECONDITION(!field->isIndexed(), "Cannot store reader if it is indexed too")
Reader* r = field->readerValue();
int32_t sz = r->size();
if ( sz < 0 )
sz = 10000000; //todo: we should warn the developer here....
//read the entire string
const TCHAR* rv;
int64_t rl = r->read(rv, sz, 1);
if ( rl > LUCENE_INT32_MAX_SHOULDBE )
_CLTHROWA(CL_ERR_Runtime,"Field length too long");
else if ( rl < 0 )
rl = 0;
fieldsStream->writeString( rv, (int32_t)rl);
}else if ( field->stringValue() != NULL ){
fieldsStream->writeString(field->stringValue(),_tcslen(field->stringValue()));
}else
_CLTHROWA(CL_ERR_Runtime, "No values are set for the field");
}
}
number: 字段编号(fi->number),以VInt写入;第一个字段的值为0,第二个字段的值为1(1-2个字节)
属性(1个字节)
内容
xxx.fdx是对xxx.fdt的索引,以每个doc作为界限,存储的是每个doc在xxx.fdt中的起始位置(pos),每个位置以一个Long写入。
主要代码为:
/**
 * Appends one buffered document to the stored-fields files.
 *
 * @param numStoredFields  count of stored fields contained in the buffer
 * @param buffer           already serialized field data of the document
 */
void FieldsWriter::flushDocument(int32_t numStoredFields, CL_NS(store)::RAMOutputStream* buffer) {
    // Index entry: the offset in the fields stream where this document starts.
    const int64_t docStart = fieldsStream->getFilePointer();
    indexStream->writeLong(docStart);

    // Document record: the field count, followed by the buffered field data.
    fieldsStream->writeVInt(numStoredFields);
    buffer->writeTo(fieldsStream);
}