Oracle AI Vector Search User's Guide는 SQL Quick Start Using a FLOAT32 Vector Generator 절에서 FLOAT32 벡터를 생성할 수 있는 패키지를 제공합니다. 이 패키지는 차원 값이 커질수록 CLOB 타입의 벡터에서 좌표 값을 가져오는 get_coordinate 함수에 의해 성능이 급격히 저하됩니다.
아래 예제는 500 차원의 벡터를 100 로우 생성한 결과를 보여줍니다.
-- 1
BEGIN
vector_gen_pkg.generate_vectors (
num_vectors => 100 -- Number of vectors to generate. Must be 1 or above
, dimensions => 500 -- Number of dimensions of each vector. Must be above 1 but less than 500
, num_clusters => 6 -- Number of clusters to create. Must be 1 or above
, cluster_spread => 1 -- Relative closeness of each vector in each cluster (using standard deviation). Must be grather than 0
, min_value => 0 -- Minimum value for a vector coordinate
, max_value => 100 -- Maximum value for a vector coordinate. Min value must be smaller than max value
);
END;
/
Elapsed: 00:01:20.00
아래는 Varray를 사용하여 성능을 개선한 FLOAT32 Vector Generator 패키지입니다.
-- 2-1
CREATE OR REPLACE PACKAGE vector_gen_pkg
AS
PROCEDURE generate_vectors (
num_vectors IN PLS_INTEGER -- Number of vectors to generate
, dimensions IN PLS_INTEGER -- Number of dimensions of each vector
, num_clusters IN PLS_INTEGER -- Number of clusters to create
, cluster_spread IN NUMBER -- Relative closeness of each vector in each cluster (using standard deviation)
, min_value IN NUMBER -- Minimum value for a vector coordinate
, max_value IN NUMBER -- Maximum value for a vector coordinate
);
END vector_gen_pkg;
/
SHOW ERROR
-- 2-2
CREATE OR REPLACE PACKAGE BODY vector_gen_pkg
AS
TYPE t_vector IS VARRAY(500) OF NUMBER;
PROCEDURE generate_random_vector (
dimensions IN PLS_INTEGER
, min_value IN NUMBER
, max_value IN NUMBER
, vec OUT t_vector
)
IS
e t_vector := t_vector ();
BEGIN
FOR i IN 1 .. dimensions LOOP
e.EXTEND;
e(e.LAST) := DBMS_RANDOM.VALUE (min_value, max_value);
END LOOP;
vec := e;
END generate_random_vector;
PROCEDURE generate_clustered_vector (
centroid IN t_vector
, cluster_spread IN NUMBER
, vec OUT t_vector
)
IS
e t_vector := t_vector ();
BEGIN
FOR i IN 1 .. centroid.COUNT LOOP
e.EXTEND;
e(e.LAST) := centroid(i) + (DBMS_RANDOM.NORMAL * cluster_spread);
END LOOP;
vec := e;
END generate_clustered_vector;
FUNCTION normalize_vector (
vec IN t_vector
)
RETURN t_vector
IS
e t_vector := t_vector ();
n NUMBER;
BEGIN
n := VECTOR_NORM (VECTOR (JSON_SERIALIZE (JSON (vec))));
FOR i IN 1 .. vec.COUNT LOOP
e.EXTEND;
e(i) := vec(i) / n;
END LOOP;
RETURN e;
END normalize_vector;
PROCEDURE generate_vectors (
num_vectors IN PLS_INTEGER -- Must be 1 or above
, dimensions IN PLS_INTEGER -- Must be above 1 but less than 500
, num_clusters IN PLS_INTEGER -- Must be 1 or above
, cluster_spread IN NUMBER -- Must be grather than 0
, min_value IN NUMBER
, max_value IN NUMBER
)
IS
TYPE t_vectors IS TABLE OF t_vector INDEX BY PLS_INTEGER;
centroids t_vectors;
vectors_per_cluster PLS_INTEGER;
remaining_vectors PLS_INTEGER;
vec t_vector;
idx PLS_INTEGER := 1;
max_id NUMBER;
working_vector t_vector;
v VECTOR;
nv VECTOR;
BEGIN
IF (num_vectors) <= 0
OR (num_clusters < 1)
OR (num_vectors < num_clusters)
OR (dimensions <= 0)
OR (dimensions > 500)
OR (cluster_spread <= 0)
OR (min_value >= max_value)
THEN
RETURN;
END IF;
SELECT MAX (ID) INTO max_id FROM genvec;
IF max_id IS NULL THEN
max_id := 0;
END IF;
-- Generate cluster centroids
FOR i IN 1 .. num_clusters LOOP
generate_random_vector (dimensions, min_value, max_value, centroids(i));
working_vector := normalize_vector (centroids(i));
v := VECTOR (JSON_SERIALIZE (JSON (centroids(i))));
nv := VECTOR (JSON_SERIALIZE (JSON (working_vector)));
INSERT
INTO genvec
VALUES (max_id + idx, v, 'C' || i, nv, DBMS_RANDOM.VALUE (3, 600000000));
idx := idx + 1;
END LOOP;
-- Calculate vectors per cluster
vectors_per_cluster := TRUNC (num_vectors / num_clusters);
remaining_vectors := num_vectors MOD num_clusters;
-- Generate vectors for each cluster
IF vectors_per_cluster > 1 THEN
FOR i IN 1 .. num_clusters LOOP
FOR j IN 1 .. (vectors_per_cluster - 1) LOOP
generate_clustered_vector (centroids(i), cluster_spread, vec);
working_vector := normalize_vector (vec);
v := VECTOR (JSON_SERIALIZE (JSON (centroids(i))));
nv := VECTOR (JSON_SERIALIZE (JSON (working_vector)));
INSERT
INTO genvec
VALUES (max_id + idx, v, 'C' || i || '-' || j, nv, DBMS_RANDOM.VALUE (3, 600000000));
idx := idx + 1;
END LOOP;
END LOOP;
END IF;
-- Handle remaining vectors: all associated with cluster 1
IF remaining_vectors > 0 THEN
FOR j IN 1 .. remaining_vectors LOOP
generate_clustered_vector (centroids (1), cluster_spread, vec);
working_vector := normalize_vector (vec);
v := VECTOR (JSON_SERIALIZE (JSON (centroids(1))));
nv := VECTOR (JSON_SERIALIZE (JSON (working_vector)));
INSERT
INTO genvec
VALUES (max_id + idx, v, 'C1-' || idx, nv, DBMS_RANDOM.VALUE (3, 600000000));
idx := idx + 1;
END LOOP;
END IF;
COMMIT;
END generate_vectors;
END vector_gen_pkg;
/
SHOW ERROR
아래 예제는 500 차원의 벡터를 100 로우 생성한 결과를 보여줍니다. 수행 시간이 80초에서 0.66초로 감소한 것을 볼 수 있습니다.
-- 3
BEGIN
vector_gen_pkg.generate_vectors (
num_vectors => 100 -- Number of vectors to generate. Must be 1 or above
, dimensions => 500 -- Number of dimensions of each vector. Must be above 1 but less than 500
, num_clusters => 6 -- Number of clusters to create. Must be 1 or above
, cluster_spread => 1 -- Relative closeness of each vector in each cluster (using standard deviation). Must be grather than 0
, min_value => 0 -- Minimum value for a vector coordinate
, max_value => 100 -- Maximum value for a vector coordinate. Min value must be smaller than max value
);
END;
/
Elapsed: 00:00:00.66