# sparkConf.properties (forked from datastax/cassandra-data-migrator)
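# Typical invocation (a sketch, not the definitive command; the job class name
# varies by CDM release, older releases used datastax.astra.migrate.Migrate, so
# check your version's README):
#   spark-submit --properties-file sparkConf.properties --master "local[*]" \
#     --class datastax.astra.migrate.Migrate cassandra-data-migrator-<version>.jar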
spark.origin.isAstra false
#spark.origin.host <cass_host>
#spark.origin.username <cass_user>
#spark.origin.password <cass_pass>
#spark.origin.keyspaceTable <keyspacename.tablename>
spark.target.isAstra true
#spark.target.scb <secure_connect_bundle_name>
#spark.target.username <astra_client_id>
#spark.target.password <astra_secret>
#spark.target.keyspaceTable <keyspacename.tablename>
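# Example with hypothetical values (illustration only): a self-managed origin
# plus an Astra target might look like
#   spark.origin.host           10.0.0.10
#   spark.origin.username       cassandra
#   spark.origin.password       cassandra
#   spark.origin.keyspaceTable  shop.orders
#   spark.target.scb            file:///path/to/secure-connect-shop.zip
#   spark.target.keyspaceTable  shop.orders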
spark.target.autocorrect.missing true
spark.target.autocorrect.mismatch true
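# Note: the autocorrect flags take effect during the validation job: "missing"
# inserts rows present on origin but absent on target, and "mismatch" overwrites
# target rows whose values differ from origin. Leave both false for a
# report-only validation run.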
#spark.files <secure-bundle-path>
#spark.tokenRange.exceptionDir <dir-for-failed-token-ranges-file>
#spark.row.exceptionDir <dir-for-failed-rows-file>
# Max size allowed for the failed-rows file, in bytes. 200000000 = 200 MB
spark.rowfailure.filesize.limit 200000000
spark.maxRetries 10
spark.maxRetries.rowFailure 2
spark.readRateLimit 200000
spark.writeRateLimit 200000
spark.batchSize 5
#spark.query.origin db_name, tenant_id, tenant_uuid
#spark.query.origin.partitionKey db_name, tenant_id
#spark.query.target db_name, tenant_id, tenant_uuid
#spark.query.target.id db_name, tenant_id
#spark.query.types 0,0,9
#spark.query.ttl.cols 2
#spark.query.writetime.cols 2
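# Reading the commented example above: assuming db_name and tenant_id are text
# and tenant_uuid is a uuid, spark.query.types becomes 0,0,9 per the type-code
# table at the bottom of this file (0 = text, 9 = uuid).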
spark.target.default.ttl.enable false
#spark.target.default.ttl 7776000
spark.target.default.writetime.enable false
#spark.target.default.writetime 1640998861000
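# For reference: TTL values are in seconds (7776000 s = 90 days). The example
# writetime 1640998861000 is an epoch timestamp (2022-01-01 01:01:01 UTC if
# read as milliseconds).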
spark.counterTable false
spark.counterTable.cql
spark.counterTable.cql.index 0
spark.splitSize 60000
#spark.origin.writeTimeStampFilter false
#spark.origin.minWriteTimeStampFilter 0
#spark.origin.maxWriteTimeStampFilter 9223372036854775807
spark.coveragePercent 100
spark.origin.read.consistency.level LOCAL_QUORUM
spark.target.read.consistency.level LOCAL_QUORUM
##### ENABLE ONLY IF COLUMN NAMES ON TARGET ARE DIFFERENT FROM ORIGIN (SCHEMA & DATA-TYPES MUST BE THE SAME) #####
#spark.query.target partition-key,clustering-key,order-date,amount
################# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE SOME DATA BASED ON A CQL FILTER ###############
#spark.query.condition
################# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE A SUBSET (NOT 100%) OF THE DATA ###############
#spark.coveragePercent 10
#################### ENABLE ONLY IF YOU WANT TO LOG STATS MORE OR LESS FREQUENTLY THAN THE DEFAULT ##########
#spark.printStatsAfter 100000
################################# ENABLE ONLY IF IT IS A COUNTER TABLE ######################################
#spark.counterTable false
#spark.counterTable.cql
#spark.counterTable.cql.index 0
######## ENABLE ONLY IF YOU WANT TO FILTER BASED ON WRITE-TIME (values must be in microseconds) #############
#spark.origin.writeTimeStampFilter false
#spark.origin.minWriteTimeStampFilter 0
#spark.origin.maxWriteTimeStampFilter 4102444800000000
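# Conversion example: 2023-01-01 00:00:00 UTC = 1672531200 seconds =
# 1672531200000000 microseconds. The max value above, 4102444800000000,
# corresponds to 2100-01-01 00:00:00 UTC.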
######## ENABLE ONLY IF YOU WANT TO USE READ AND/OR WRITE CONSISTENCY OTHER THAN LOCAL_QUORUM ##############
#spark.consistency.read LOCAL_QUORUM
#spark.consistency.write LOCAL_QUORUM
############# ENABLE ONLY IF YOU WANT TO REDUCE FETCH-SIZE TO AVOID FrameTooLongException ##################
#spark.read.fetch.sizeInRows 1000
############### ENABLE ONLY IF YOU WANT TO USE CUSTOM FIXED WRITETIME VALUE ON TARGET ######################
#spark.target.custom.writeTime 0
#################### USE ONLY IF YOU NEED TO SKIP RECORDS LARGER THAN 10MB FROM ORIGIN ######################
#spark.fieldGuardraillimitMB 10
#################### USE ONLY IF YOU NEED A COUNT OF RECORDS LARGER THAN 10MB ON ORIGIN #####################
#spark.origin.checkTableforColSize false
#spark.origin.checkTableforColSize.cols partition-key,clustering-key
#spark.origin.checkTableforColSize.cols.types 9,1
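# In the commented example above, cols.types 9,1 decodes to uuid and
# int/smallint per the type-code table at the bottom of this file.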
############################ USE ONLY IF YOU NEED TO FILTER DATA FROM ORIGIN ################################
#spark.origin.FilterData false
#spark.origin.FilterColumn test
#spark.origin.FilterColumnIndex 2
#spark.origin.FilterColumnType 6%16
#spark.origin.FilterColumnValue test
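# In the commented example above, FilterColumnType 6%16 decodes to a list of a
# user-defined type (6 = list, 16 = UDT) per the type-code table at the bottom
# of this file; FilterColumnIndex gives the filter column's position in the
# origin query column list (an assumption based on the naming).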
########################## ONLY USE if SSL clientAuth is enabled on origin Cassandra/DSE ####################
#spark.origin.trustStore.path
#spark.origin.trustStore.password
#spark.origin.trustStore.type JKS
#spark.origin.keyStore.path
#spark.origin.keyStore.password
#spark.origin.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA
####################### ONLY USE if SSL clientAuth is enabled on target Cassandra/DSE #######################
#spark.target.trustStore.path
#spark.target.trustStore.password
#spark.target.trustStore.type JKS
#spark.target.keyStore.path
#spark.target.keyStore.password
#spark.target.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA
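# Example with hypothetical values (illustration only):
#   spark.target.trustStore.path      /etc/cdm/ssl/truststore.jks
#   spark.target.trustStore.password  changeit
#   spark.target.trustStore.type      JKS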
#############################################################################################################
# The following type codes map to the corresponding Cassandra data types:
# 0: ascii, text, varchar
# 1: int, smallint
# 2: bigint, counter
# 3: double
# 4: time, timestamp
# 5: map (separate type by %) - Example: 5%1%0 for map<int, text>
# 6: list (separate type by %) - Example: 6%0 for list<text>
# 7: blob
# 8: set (separate type by %) - Example: 8%0 for set<text>
# 9: uuid, timeuuid
# 10: boolean
# 11: tuple
# 12: float
# 13: tinyint
# 14: decimal
# 15: date
# 16: UDT [any user-defined-type created using 'CREATE TYPE']
# 17: varint
# Note: Ignore "frozen" when mapping collections (map/list/set) - Example: 5%1%0 for frozen<map<int, text>>
#
# "spark.query.ttl.cols" - Comma separated column indexes from "spark.query.origin" used to find largest TTL.
# "spark.query.writetime.cols" - Comma separated column indexes from "spark.query.origin" used to find largest writetime.
# Note: The tool migrates TTL & Writetimes at row-level and not field-level.
# Migration will use the largest TTL & Writetimes value per row.
#
#############################################################################################################
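# Worked example (hypothetical schema, for illustration only):
#   CREATE TABLE shop.orders (db_name text, tenant_id text, tenant_uuid uuid,
#     tags frozen<map<int, text>>, PRIMARY KEY ((db_name, tenant_id), tenant_uuid));
# maps to:
#   spark.query.origin              db_name, tenant_id, tenant_uuid, tags
#   spark.query.origin.partitionKey db_name, tenant_id
#   spark.query.types               0,0,9,5%1%0
# ("frozen" is ignored, so frozen<map<int, text>> is still mapped as 5%1%0)
#############################################################################################################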