# Load-ImdbStaging.ps1
# Downloads the compressed IMDb TSV datasets and loads each file into its
# staging table in the [Raw] schema of the IMDB database.
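# Connection string for the target SQL Server instance: a local named instance
# (.\SQL2022) using Windows authentication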
$sqlServerConnstring = "Data Source=.\SQL2022;Initial Catalog=IMDB;Integrated Security=True"
# List of URLs to download
$urls = @(
"https://datasets.imdbws.com/name.basics.tsv.gz",
"https://datasets.imdbws.com/title.akas.tsv.gz",
"https://datasets.imdbws.com/title.basics.tsv.gz",
"https://datasets.imdbws.com/title.crew.tsv.gz",
"https://datasets.imdbws.com/title.episode.tsv.gz",
"https://datasets.imdbws.com/title.principals.tsv.gz",
"https://datasets.imdbws.com/title.ratings.tsv.gz"
)
# Download each file
foreach ($url in $urls) {
    Invoke-WebRequest -Uri $url -OutFile ([System.IO.Path]::GetFileName($url))
}
# Import System.IO.Compression.FileSystem assembly
Add-Type -AssemblyName System.IO.Compression.FileSystem
# Find all of the downloaded .tsv.gz files in the current directory
$files = Get-Item './*.tsv.gz'
# Connect to your SQL Server
$conn = New-Object System.Data.SqlClient.SqlConnection($sqlServerConnstring)
$conn.Open()
foreach ($file in $files) {
    # Write the current file name to the console
    $file.Name
    $rowNumber = 0
    # Open the .gz file
    $gzipFileStream = [System.IO.File]::OpenRead($file.FullName)
    # Create a GZipStream object for decompression
    $decompressionStream = New-Object System.IO.Compression.GZipStream($gzipFileStream, [System.IO.Compression.CompressionMode]::Decompress)
    # Create a stream reader to read from the decompression stream
    $streamReader = New-Object System.IO.StreamReader($decompressionStream)
    # Create a command
    $cmd = $conn.CreateCommand()
    # Begin a transaction
    $transaction = $conn.BeginTransaction()
    $cmd.Connection = $conn
    $cmd.Transaction = $transaction
    try {
        # Empty the staging table before reloading; the table name matches the source file name
        $cmd.CommandText = "TRUNCATE TABLE [Raw].["+$file.Name+"];"
        $cmd.ExecuteNonQuery() | Out-Null
        # Read line by line from the .gz file
        while (($line = $streamReader.ReadLine()) -ne $null) {
            # Parse the line and insert it into the SQL Server table, skipping the header row (row 0)
            if ($rowNumber -gt 0) {
                # Skip rows that contain nothing but \N (NULL) markers and tabs
                if ($line -replace("\\N", "") -replace("`t", "") -ne "") {
                    # Escape single quotes, wrap each tab-separated value in an N'...' literal, and map \N to NULL
                    $cmd.CommandText = "INSERT INTO [Raw].["+$file.Name+"] VALUES ("+(("N'"+($line -replace("'", "''"))+"'") -replace("`t", "', N'") -replace("N'\\N'", "NULL"))+");"
                    $cmd.ExecuteNonQuery() | Out-Null
                }
            }
            $rowNumber++
            # For every 10,000 rows, commit the transaction and start a new
            # transaction. This effectively batches the log writes, which
            # improves the overall INSERT performance.
            if (($rowNumber % 10000) -eq 0) {
                "Processed: $rowNumber"
                # Commit the transaction
                $transaction.Commit()
                # Begin a transaction
                $transaction = $conn.BeginTransaction()
                $cmd.Connection = $conn
                $cmd.Transaction = $transaction
            }
        }
        "Processed: $rowNumber"
        # Commit the final, partially filled batch
        $transaction.Commit()
    }
    catch {
        # Report the failing statement and the error, then roll back the current batch
        $cmd.CommandText
        $_
        $transaction.Rollback()
    }
    finally {
        # Always close the streams for the current file
        $streamReader.Close()
        $decompressionStream.Close()
        $gzipFileStream.Close()
    }
}
# Close the connection
$conn.Close()
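# Optional follow-up (a sketch, not part of the load itself): open a fresh
# connection and spot-check the row count of one staging table. The table name
# below is just an example; substitute any of the [Raw] tables loaded above.
#
#   $checkConn = New-Object System.Data.SqlClient.SqlConnection($sqlServerConnstring)
#   $checkConn.Open()
#   $checkCmd = $checkConn.CreateCommand()
#   $checkCmd.CommandText = "SELECT COUNT(*) FROM [Raw].[title.ratings.tsv.gz];"
#   "title.ratings.tsv.gz rows: " + $checkCmd.ExecuteScalar()
#   $checkConn.Close()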