00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
#include "SentencesBlocks.h"
00042
00043
namespace PLearn {
00044
using namespace std;
00045
00048 SentencesBlocks::SentencesBlocks(
int n_blocks,
VMat d,
Vec separator)
00049 :
TVec<
VMat>(n_blocks)
00050 {
00051
if (n_blocks==1)
00052 {
00053 (*this)[0]=d;
00054
return;
00055 }
00056
int total_size = d->
length();
00057
if (total_size < n_blocks * 2)
00058
PLERROR(
"SentencesBlocks: can't have blocks of size < 2 in average");
00059
Vec v(d->
width());
00060
int b=0;
00061
int previous_previous_block=0,previous_beginning_of_block = 0, previous_beginning_of_sentence=0;
00062
int next_target = (
int)(total_size / (
real)n_blocks);
00063
for (
int i=0;i<total_size && b<n_blocks-1;i++)
00064 {
00065 d->getRow(i,v);
00066
if (v==separator)
00067 {
00068
if (i>=next_target)
00069 {
00070
int cut=0;
00071
if (i-next_target < next_target-previous_beginning_of_sentence ||
00072 previous_beginning_of_sentence < previous_beginning_of_block)
00073 cut=i+1;
00074
else
00075 {
00076 cut=previous_beginning_of_sentence;
00077 previous_beginning_of_sentence = i+1;
00078 }
00079 (*this)[b++] = d.
subMatRows(previous_beginning_of_block,
00080 cut-previous_beginning_of_block);
00081 previous_previous_block = previous_beginning_of_block;
00082 previous_beginning_of_block=cut;
00083
if (b<n_blocks)
00084 {
00085
if (b>n_blocks-3)
00086 next_target = (
int)((total_size - cut) / (
real)(n_blocks-b));
00087
else
00088 next_target = (
int)(total_size * (
real)(b+1.0) / n_blocks);
00089 }
00090 }
00091
else
00092 previous_beginning_of_sentence=i+1;
00093 }
00094 }
00095
if (b==n_blocks-1)
00096 (*this)[b++] = d.
subMatRows(previous_beginning_of_block,
00097 total_size-previous_beginning_of_block);
00098
if (b<n_blocks-1)
00099 {
00100
if (b<n_blocks-2)
00101
PLERROR(
"SentencesBlocks: blocks are too small!");
00102
if (previous_beginning_of_sentence<previous_beginning_of_block)
00103
PLERROR(
"SentencesBlocks: Blocks are too small!");
00104
int cut = previous_beginning_of_sentence;
00105 (*this)[b++] = d.
subMatRows(previous_beginning_of_block,
00106 cut-previous_beginning_of_block);
00107 previous_beginning_of_block=cut;
00108 (*this)[b++] = d.
subMatRows(previous_beginning_of_block,
00109 total_size-previous_beginning_of_block);
00110 }
00111 }
00112
00113 }