Oracle 23ai(23.6)에 Relational Data Vectorization 기능이 추가되었습니다. 이 기능을 사용하면 관계형 데이터로 벡터를 생성할 수 있습니다.
테스트를 위해 Bank Marketing 데이터를 다운로드하여 아래와 같이 압축을 해제하겠습니다. bank-additional-full.csv 파일의 로우 수는 41,189 건입니다.
$ unzip bank+marketing.zip
$ unzip bank-additional.zip
$ cd bank-additional
$ wc -l bank-additional-full.csv
41189 bank-additional-full.csv
아래와 같이 External 테이블를 사용하여 bank-additional-full.csv 파일을 bank 테이블로 적재하겠습니다.
-- 1-1
DROP DIRECTORY dir_bank;
CREATE DIRECTORY dir_bank AS '/home/oracle/bank/bank-additional';
-- 1-2
DROP TABLE ext_bank PURGE;
CREATE TABLE ext_bank (
age NUMBER
, job VARCHAR2(100)
, marital VARCHAR2(100)
, education VARCHAR2(100)
, credit_default VARCHAR2(100)
, housing VARCHAR2(100)
, loan VARCHAR2(100)
, contact VARCHAR2(100)
, month VARCHAR2(100)
, day_of_week VARCHAR2(100)
, duration NUMBER
, campaign NUMBER
, pdays NUMBER
, previous NUMBER
, poutcome VARCHAR2(100)
, emp_var_rate NUMBER
, cons_price_idx NUMBER
, cons_conf_idx NUMBER
, euribor3m NUMBER
, nr_employed NUMBER
, y VARCHAR2(100)
)
ORGANIZATION EXTERNAL (
TYPE ORACLE_LOADER
DEFAULT DIRECTORY dir_bank
ACCESS PARAMETERS (
RECORDS
SKIP 1
FIELDS CSV WITH EMBEDDED TERMINATED BY ";" OPTIONALLY ENCLOSED BY '"'
MISSING FIELD VALUES ARE NULL
)
LOCATION ('bank-additional-full.csv')
);
-- 1-3
DROP TABLE bank PURGE;
CREATE TABLE bank AS SELECT ROWNUM AS id, a.* FROM ext_bank a;
Elapsed: 00:00:00.27
-- 1-4
SELECT COUNT (*) AS cnt FROM bank;
CNT
-----
41188
1 row selected.
아래와 같이 DBMS_DATA_MINING 패키지를 사용해 모델을 생성하겠습니다. OML Feature Extraction은 데이터를 압축된 숫자 공간(벡터)에 투영합니다. 예제는 SVD 알고리즘을 구성하여 원본 데이터 테이블에 대해 Principal Component Analysis (PCA) 투영을 수행합니다. Feature 개수(칼럼 수와 벡터 수)는 5개입니다.
-- 2-1
DECLARE
v_setlist DBMS_DATA_MINING.SETTING_LIST;
BEGIN
v_setlist(DBMS_DATA_MINING.ALGO_NAME) := DBMS_DATA_MINING.ALGO_SINGULAR_VALUE_DECOMP;
v_setlist(DBMS_DATA_MINING.PREP_AUTO) := DBMS_DATA_MINING.PREP_AUTO_ON;
v_setlist(DBMS_DATA_MINING.SVDS_SCORING_MODE) := DBMS_DATA_MINING.SVDS_SCORING_PCA;
v_setlist(DBMS_DATA_MINING.FEAT_NUM_FEATURES) := '5';
DBMS_DATA_MINING.DROP_MODEL (
model_name => 'BANK_MODEL'
, force => TRUE
);
DBMS_DATA_MINING.CREATE_MODEL2 (
model_name => 'BANK_MODEL'
, mining_function => DBMS_DATA_MINING.FEATURE_EXTRACTION
, data_query => 'SELECT * FROM BANK'
, set_list => v_setlist
, case_id_column_name => 'ID'
);
END;
/
-- 2-2
SELECT table_name, num_rows FROM user_tables WHERE table_name LIKE '%BANK_MODEL' ORDER BY table_name;
TABLE_NAME NUM_ROWS
--------------- --------
DM$P5BANK_MODEL 68
DM$PABANK_MODEL 5
DM$PBBANK_MODEL 315
DM$PCBANK_MODEL 0
DM$PDBANK_MODEL 63
DM$PMBANK_MODEL 21
DM$PPBANK_MODEL 7
7 rows selected.
아래와 같이 벡터 테이블과 벡터 인덱스를 생성하겠습니다.
-- 3-1
SELECT id, VECTOR_EMBEDDING (bank_model USING *) AS embedding FROM bank WHERE id = 1;
ID EMBEDDING
-- --------------------------------------------------------------------------------------------------------------------------
1 [-4.5287840901963311E-001,3.9973657855390535E+001,1.7814307848694632E+001,1.6308564650063971E+001,2.9979559766641883E+000]
1 row selected.
-- 3-2
DROP TABLE bank_vector PURGE;
CREATE TABLE bank_vector AS SELECT id, VECTOR_EMBEDDING (bank_model USING *) AS vector FROM bank;
-- 3-3
CREATE VECTOR INDEX bank_vector_x1 ON bank_vector (vector) ORGANIZATION NEIGHBOR PARTITIONS;
아래 예제는 id 1번 로우와 유사한 로우를 검색합니다. id 1번 로우의 Feature 칼럼은 cons_conf_idx, pdays, euribor3m, contact, campaign입니다.
-- 4-1
SELECT FEATURE_DETAILS (bank_model, 5 USING *) AS features FROM bank WHERE id = 1;
FEATURES
----------------------------------------------------------------------------
<Details algorithm="Singular Value Decomposition" feature="5">
<Attribute name="CONS_CONF_IDX" actualValue="-36.4" weight=".697" rank="1"/>
<Attribute name="PDAYS" actualValue="999" weight=".023" rank="2"/>
<Attribute name="EURIBOR3M" actualValue="4.857" weight=".014" rank="3"/>
<Attribute name="CONTACT" actualValue="telephone" weight=".009" rank="4"/>
<Attribute name="CAMPAIGN" actualValue="1" weight=".008" rank="5"/>
</Details>
1 row selected.
-- 4-2
VAR b_vector CLOB
EXEC SELECT vector INTO :b_vector FROM bank_vector WHERE id = 1;
SELECT a.id, b.age, b.marital, b.education, b.credit_default, b.housing
, b.cons_conf_idx, b.pdays, b.euribor3m, b.contact, b.campaign
FROM bank_vector a, bank b
WHERE a.id != 1
AND b.id = a.id
ORDER BY VECTOR_DISTANCE (a.vector, TO_VECTOR (:b_vector, 5, FLOAT64))
FETCH FIRST 3 ROWS ONLY;
ID AGE MARITAL EDUCATION CREDIT_DEFAULT HOUSING CONS_CONF_IDX PDAYS EURIBOR3M CONTACT CAMPAIGN
---- --- ------- ---------- -------------- ------- ------------- ----- --------- --------- --------
3631 56 married basic.9y unknown yes -36.4 999 4.859 telephone 1
22 55 married basic.4y unknown yes -36.4 999 4.857 telephone 1
459 54 married basic.4y no no -36.4 999 4.857 telephone 2
3 rows selected.
-- 4-3
SELECT MIN (cons_conf_idx) AS min, MAX (cons_conf_idx) AS max, AVG (cons_conf_idx) AS avg, STDDEV (cons_conf_idx) AS std FROM bank;
MIN MAX AVG STD
------ ------ ------ ----
-50.80 -26.90 -40.50 4.63
1 row selected.
아래 예제는 id 10000번 로우와 유사한 로우를 검색합니다. id 10000번 로우의 Feature 칼럼은 pdays, euribor3m, contact, emp_var_rate, day_of_week입니다.
-- 5-1
SELECT FEATURE_DETAILS (bank_model, 5 USING *) AS features FROM bank WHERE id = 10000;
FEATURES
--------------------------------------------------------------------------
<Details algorithm="Singular Value Decomposition" feature="5">
<Attribute name="PDAYS" actualValue="999" weight=".041" rank="1"/>
<Attribute name="EURIBOR3M" actualValue="4.959" weight=".028" rank="2"/>
<Attribute name="CONTACT" actualValue="telephone" weight=".016" rank="3"/>
<Attribute name="EMP_VAR_RATE" actualValue="1.4" weight=".014" rank="4"/>
<Attribute name="DAY_OF_WEEK" actualValue="wed" weight=".002" rank="5"/>
</Details>
1 row selected.
-- 5-2
VAR b_vector CLOB
EXEC SELECT vector INTO :b_vector FROM bank_vector WHERE id = 10000;
SELECT a.id, b.age, b.marital, b.education, b.credit_default, b.housing
, b.pdays, b.euribor3m, b.contact, b.emp_var_rate, b.day_of_week
FROM bank_vector a, bank b
WHERE a.id != 10000
AND b.id = a.id
ORDER BY VECTOR_DISTANCE (a.vector, TO_VECTOR (:b_vector, 5, FLOAT64))
FETCH FIRST 3 ROWS ONLY;
ID AGE MARITAL EDUCATION CREDIT_DEFAULT HOUSING PDAYS EURIBOR3M CONTACT EMP_VAR_RATE DAY_OF_WEEK
----- --- ------- ----------------- -------------- ------- ----- ---------- --------- ------------ -----------
9416 40 married basic.9y no no 999 4.967 telephone 1.4 fri
13485 40 married basic.4y unknown no 999 4.963 telephone 1.4 thu
9800 40 single university.degree no yes 999 4.959 telephone 1.4 wed
3 rows selected.
-- 5-3
SELECT MIN (pdays) AS min, MAX (pdays) AS max, AVG (pdays) AS avg, STDDEV (pdays) AS std FROM bank;
MIN MAX AVG STD
---- ------ ------ ------
0.00 999.00 962.48 186.91
1 row selected.
관련 링크