{"id":29957,"date":"2024-02-16T03:30:26","date_gmt":"2024-03-08T22:54:22","guid":{"rendered":"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/"},"modified":"2025-08-12T16:46:47","modified_gmt":"2025-08-12T08:46:47","slug":"2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82","status":"publish","type":"post","link":"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/","title":{"rendered":"2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248]"},"content":{"rendered":"<p>\uff082022.1.24\u4fee\u6b63\uff09<br \/>\n\u4e3a\u4e86\u4f7fSpark\u5728Windows\u4e0a\u8fd0\u884c\uff0c\u5bf9winutils\u8fdb\u884c\u4e86\u4fee\u6b63\u3002<br \/>\n\u4e3a\u4e86\u5728Windows\u73af\u5883\u4e2d\u8fd0\u884cSpark\uff0c\u5e38\u5e38\u53c2\u8003cdarlint\/winutils\u548csteveloughran\/winutils\uff0c\u4f46\u662f\u53d1\u73b0\u8fd9\u4e9b\u4e8c\u8fdb\u5236\u6587\u4ef6\u5728Windows11\u73af\u5883\u4e0b\u65e0\u6cd5\u8fd0\u884c\uff0c\u56e0\u6b64\u91cd\u65b0\u6784\u5efa\u4e86winutils\uff0c\u5e76\u5728GitHub\u4e0a\u8fdb\u884c\u4e86\u516c\u5f00\u53d1\u5e03\u3002<br \/>\n\u4e0b\u9762\u5c06\u8be6\u7ec6\u8bf4\u660e\u8be5\u8fc7\u7a0b\u3002<\/p>\n<hr>\n<\/hr>\n<h2>0. \u4ec0\u4e48\u662fPySpark\uff1f<\/p>\n<p>PySpark\u662f\u4e00\u4e2a\u5229\u7528Apache Spark\u8fdb\u884c\u5206\u5e03\u5f0f\u5904\u7406\u7b49\u64cd\u4f5c\u7684\u5e93\uff0c\u7528\u4e8e\u5206\u6790\u5927\u5bb9\u91cf\u6570\u636e\u3002\u5b83\u662f\u5728Python\u754c\u9762\u4e0b\u53ef\u7528\u7684\u3002<\/p>\n<p>\u7531\u65bc\u5b83\u53ef\u4ee5\u9069\u61c9\u591a\u7a2e\u6578\u64da\u6e90\u4e26\u9032\u884c\u9748\u6d3b\u7684\u6578\u64da\u8655\u7406\uff0c\u56e0\u6b64\u5b83\u975e\u5e38\u9069\u5408\u5728\u7d71\u4e00\u6578\u64da\u8655\u7406\u6642\u4f7f\u7528\u3002<\/p>\n<div>\n<div class=\"post-table\">\n<thead>\n<tr>\n<th style=\"text-align: left\">\u7528\u8a9e<\/th>\n<th style=\"text-align: left\">\u5185\u5bb9<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td style=\"text-align: left\">Apache Spark<\/td>\n<td style=\"text-align: left\">\u5de8\u5927\u306a\u30c7\u30fc\u30bf\u3092\u9ad8\u901f\u306b\u51e6\u7406\u3067\u304d\u308b<\/td>\n<\/tr>\n<tr>\n<td style=\"text-align: left\">PySpark<\/td>\n<td style=\"text-align: left\">Apache Spark\u3092Python\u304b\u3089\u4f7f\u3046\u3053\u3068\u304c\u3067\u304d\u308b\u3088\u3046\u306b\u3057\u305f\u30e9\u30a4\u30d6\u30e9\u30ea<\/td>\n<\/tr>\n<\/tbody>\n<p>\u63a5\u4e0b\u6765\uff0c\u6211\u4eec\u5c06\u4e3b\u8981\u8ba8\u8bba\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark\u3002<\/p>\n<h2>1. Apache Spark (\u963f\u5e15\u5947\u00b7\u65af\u5df4\u514b)<\/p>\n<h3>1.1. \u914d\u7f6e Apache Spark \u7684\u73af\u5883<\/p>\n<p>\u8bf7\u6309\u7167winutils\u7684README.md\u6587\u4ef6\u6765\u914d\u7f6e\u73af\u5883\u3002<\/p>\n<h3>1.2. \u78ba\u8a8d\u52d5\u4f5c<\/p>\n<p>\u5982\u679c\u4e0a\u8ff0\u64cd\u4f5c\u5b8c\u6210\u4e86\uff0c\u6211\u4eec\u5c06\u5c1d\u8bd5\u4ece\u547d\u4ee4\u884c\u542f\u52a8spark-shell\u3002<\/p>\n<pre class=\"post-pre\"><code>&gt;spark-shell\r\n<\/code><\/pre>\n<h2>2. PySpark: \u7b49\u540c\u4e8e\u4f7f\u7528 Python \u7684 Spark\u3002<\/p>\n<p>\u63a5\u4e0b\u6765\uff0c\u6211\u4eec\u8981\u4f7fPython\u73af\u5883\u80fd\u591f\u4f7f\u7528Spark\u3002<\/p>\n<h3>2.1. \u5b89\u88c5pyspark<\/p>\n<p>\u5728\u60a8\u7684Python\u73af\u5883\u4e2d\uff0c\u6dfb\u52a0pyspark\u3002<\/p>\n<blockquote><p>\u4f7f\u7528pip\u5b89\u88c5pyspark\u3002<\/p><\/blockquote>\n<h2>2.2.\u8bbe\u7f6e\u73af\u5883\u53d8\u91cf2<\/p>\n<p>PYSPARK_PYTHON \u7684\u672c\u5730\u6b78\u7d04<\/p>\n<p>\u5c06\u8981\u4f7f\u7528\u7684Python\u73af\u5883\u8bbe\u7f6e\u4e3a\u73af\u5883\u53d8\u91cfPYSPARK_PYTHON\u3002<\/p>\n<blockquote><p>\u914d\u7f6e PYSPARK_PYTHON \u73af\u5883\u53d8\u91cf\u4e3a C:\\xxxx\\bin\\python.exe\u3002<\/p><\/blockquote>\n<h3>2.3.\u786e\u8ba4\u64cd\u4f5c<\/p>\n<p>\u8ba9\u6211\u4eec\u542f\u52a8 PySpark\u3002<\/p>\n<pre class=\"post-pre\"><code>&gt; pyspark\r\n<\/code><\/pre>\n<p>\u5982\u679c\u80fd\u591f\u6210\u529f\u542f\u52a8\uff0c\u6211\u4eec\u5c31\u6765\u7f16\u5199\u7a0b\u5e8f\u5427\u3002<\/p>\n<h2>\u8bd5\u8bd5PySpark<\/p>\n<p>\u57fa\u672c\u4e0a\uff0c\u6211\u4eec\u4f1a\u521b\u5efa\u4e00\u4e2aSparkContext\u5bf9\u8c61\u6765\u64cd\u4f5c\u6570\u636e\u3002\u5728\u8fd9\u91cc\uff0c\u6211\u4eec\u5c06\u5c1d\u8bd5\u64cd\u4f5c\u4e24\u79cd\u6570\u636e\u683c\u5f0f\u3002<\/p>\n<h3>\u8bd5\u7740\u4f7f\u7528RDD\u3002<\/p>\n<p>RDD\u662f\u6307RDD\uff08Resilient Distributed Dataset\uff09\uff0c\u5728Apache Spark\u7f16\u7a0b\u4e2d\uff0c\u57fa\u672c\u4e0a\u662f\u901a\u8fc7RDD\u6765\u6301\u6709\u548c\u64cd\u4f5c\u6570\u636e\u7684\u3002<\/p>\n<p>\u5728PySpark\u4e2d\uff0c\u6211\u5011\u5275\u5efa\u4e00\u500bSparkContext\u5c0d\u8c61\uff0c\u901a\u904e\u5b83\u4f86\u9032\u884c\u5404\u7a2e\u6578\u64da\u8655\u7406\u3002<\/p>\n<pre class=\"post-pre\"><code><span class=\"kn\">import<\/span> <span class=\"nn\">pyspark<\/span>\r\n<span class=\"kn\">from<\/span> <span class=\"nn\">pyspark<\/span> <span class=\"kn\">import<\/span> <span class=\"n\">SparkContext<\/span>\r\n\r\n<span class=\"c1\"># SparkContext\u306e\u4f5c\u6210\r\n<\/span><span class=\"n\">sc<\/span> <span class=\"o\">=<\/span> <span class=\"n\">SparkContext<\/span><span class=\"p\">(<\/span><span class=\"n\">appName<\/span><span class=\"o\">=<\/span><span class=\"s\">'spark_sample'<\/span><span class=\"p\">)<\/span>\r\n\r\n<span class=\"c1\"># RDD\u3092\u4f5c\u6210\u3059\u308b\u3002\r\n<\/span><span class=\"n\">rdd<\/span> <span class=\"o\">=<\/span> <span class=\"n\">sc<\/span><span class=\"p\">.<\/span><span class=\"n\">parallelize<\/span><span class=\"p\">([<\/span>\r\n    <span class=\"p\">(<\/span><span class=\"mi\">1<\/span><span class=\"p\">,<\/span> <span class=\"s\">'Foo'<\/span><span class=\"p\">),<\/span>\r\n    <span class=\"p\">(<\/span><span class=\"mi\">2<\/span><span class=\"p\">,<\/span> <span class=\"s\">'Bar'<\/span><span class=\"p\">),<\/span>\r\n    <span class=\"p\">(<\/span><span class=\"mi\">3<\/span><span class=\"p\">,<\/span> <span class=\"s\">'Baz'<\/span><span class=\"p\">),<\/span>\r\n<span class=\"p\">])<\/span>\r\n\r\n<span class=\"c1\"># RDD\u306b\u304b\u3051\u308bFilter\u95a2\u6570(2\u4ee5\u4e0a\u306e\u8981\u7d20\u3092\u53d6\u308a\u51fa\u3059)\r\n<\/span><span class=\"k\">def<\/span> <span class=\"nf\">filter_func<\/span><span class=\"p\">(<\/span><span class=\"n\">x<\/span><span class=\"p\">):<\/span>\r\n    <span class=\"n\">n<\/span><span class=\"p\">,<\/span><span class=\"n\">s<\/span> <span class=\"o\">=<\/span> <span class=\"n\">x<\/span>\r\n    <span class=\"k\">return<\/span> <span class=\"n\">n<\/span> <span class=\"o\">&gt;=<\/span> <span class=\"mi\">2<\/span>\r\n\r\n<span class=\"n\">rdd<\/span> <span class=\"o\">=<\/span> <span class=\"n\">rdd<\/span><span class=\"p\">.<\/span><span class=\"nb\">filter<\/span><span class=\"p\">(<\/span><span class=\"n\">filter_func<\/span><span class=\"p\">)<\/span>\r\n<span class=\"c1\"># \u7d50\u679c\u306e\u8868\u793a\r\n<\/span><span class=\"k\">print<\/span><span class=\"p\">(<\/span><span class=\"n\">rdd<\/span><span class=\"p\">.<\/span><span class=\"n\">collect<\/span><span class=\"p\">())<\/span>\r\n<\/code><\/pre>\n<p>\u7531\u4e8eRDD\u63d0\u4f9b\u4e86\u8bf8\u5982filter\/map\u7b49\u51fd\u6570\u5f0f\u7f16\u7a0b\u4e2d\u719f\u6089\u7684\u65b9\u6cd5\uff0c\u56e0\u6b64\u53ef\u4ee5\u5b9e\u73b0\u7075\u6d3b\u7684\u64cd\u4f5c\u3002<\/p>\n<h3>\u5c1d\u8bd5\u4f7f\u7528DataFrame<\/p>\n<p>DataFrame\u662f\u4e00\u79cd\u7c7b\u4f3c\u4e8e\u6570\u636e\u5e93\u8868\u7ed3\u6784\u7684\u5e26\u6709\u5217\u7684\u8868\u683c\u7ed3\u6784\u3002\u53ef\u4ee5\u901a\u8fc7\u4e3aRDD\u63d0\u4f9b\u6570\u636e\u6a21\u5f0f\u6765\u521b\u5efaDataFrame\u3002\u8fd9\u6837\u5c31\u53ef\u4ee5\u5b9e\u73b0\u7c7b\u4f3cSQL\u7684\u67e5\u8be2\u64cd\u4f5c\u3002<\/p>\n<p>\u7d9a\u3044\u3066JSON\u3092DataFrame\u5316\u3057\u3066\u30c7\u30fc\u30bf\u51e6\u7406\u3092\u884c\u3063\u3066\u307f\u307e\u3057\u3087\u3046\u3002<\/p>\n<p>\u6837\u672c.json<br \/>\n\u6837\u672c.json<br \/>\n{&#8220;\u59d3\u540d&#8221;:&#8221;\u7231\u4e3d\u4e1d&#8221;,&#8221;\u5e74\u9f84&#8221;:20}<br \/>\n{&#8220;\u59d3\u540d&#8221;:&#8221;\u9c8d\u52c3&#8221;,&#8221;\u5e74\u9f84&#8221;:25}<\/p>\n<p>\u3055\u304d\u307b\u3069\u306fSparkContext\u3092\u30a8\u30f3\u30c8\u30ea\u3068\u3057\u3066\u4f7f\u3044\u307e\u3057\u305f\u304c\u3001__\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u3084\u30c7\u30fc\u30bf\u30d5\u30ec\u30fc\u30e0\u306eAPI\u3092\u4f7f\u7528\u3059\u308b\u969b\u306e\u30a8\u30f3\u30c8\u30ea\u30dd\u30a4\u30f3\u30c8\u306f\u3001SparkSession\u3067\u3059\u3002_ SparkSession\u306f\u3001builder\u3092\u4f7f\u7528\u3057\u3066\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u4f5c\u6210\u3057\u307e\u3059\u3002<\/p>\n<pre class=\"post-pre\"><code><span class=\"n\">spark<\/span> <span class=\"o\">=<\/span> <span class=\"n\">SparkSession<\/span><span class=\"p\">.<\/span><span class=\"n\">builder<\/span> \\\r\n    <span class=\"p\">.<\/span><span class=\"n\">master<\/span><span class=\"p\">(<\/span><span class=\"s\">\"local\"<\/span><span class=\"p\">)<\/span> \\\r\n    <span class=\"p\">.<\/span><span class=\"n\">appName<\/span><span class=\"p\">(<\/span><span class=\"s\">\"AppName\"<\/span><span class=\"p\">)<\/span> \\\r\n    <span class=\"p\">.<\/span><span class=\"n\">config<\/span><span class=\"p\">(<\/span><span class=\"s\">\"spark.some.config.option\"<\/span><span class=\"p\">,<\/span> <span class=\"s\">\"some-value\"<\/span><span class=\"p\">)<\/span> \\\r\n    <span class=\"p\">.<\/span><span class=\"n\">getOrCreate<\/span><span class=\"p\">()<\/span>\r\n<\/code><\/pre>\n<p>SparkSession\u4f5c\u6210\u3057\u3001\u30c7\u30fc\u30bf\u3092\u51e6\u7406\u3057\u3066\u307f\u307e\u3059\u3002JSON\u304b\u3089\u53d6\u308a\u8fbc\u3093\u3060\u5834\u5408\u3001\u81ea\u52d5\u7684\u306b\u30b9\u30ad\u30fc\u30de\u3082\u63a8\u6e2c\u3055\u308c\u3066\u3044\u307e\u3059\u306e\u3067\u3001\u4ee5\u4e0b\u3067\u306f\u8907\u6570\u884c\u304b\u3089\u306a\u308bJSON\u30c7\u30fc\u30bf\u3092\u53d6\u308a\u8fbc\u3093\u3067\u30c7\u30fc\u30bf\u30d5\u30ec\u30fc\u30e0\u3092\u4f5c\u6210\u3057\u3001\u691c\u7d22\u3092\u884c\u3063\u3066\u307f\u307e\u3059\u3002<\/p>\n<p>Python\u4ee3\u7801<\/p>\n<pre class=\"post-pre\"><code><span class=\"kn\">import<\/span> <span class=\"nn\">pyspark<\/span>\r\n<span class=\"kn\">from<\/span> <span class=\"nn\">pyspark.sql<\/span> <span class=\"kn\">import<\/span> <span class=\"n\">SparkSession<\/span>\r\n\r\n<span class=\"c1\"># SparkSession\u306e\u4f5c\u6210\r\n<\/span><span class=\"n\">spark<\/span> <span class=\"o\">=<\/span> <span class=\"n\">SparkSession<\/span><span class=\"p\">.<\/span><span class=\"n\">builder<\/span> \\\r\n    <span class=\"p\">.<\/span><span class=\"n\">master<\/span><span class=\"p\">(<\/span><span class=\"s\">\"local\"<\/span><span class=\"p\">)<\/span> \\\r\n    <span class=\"p\">.<\/span><span class=\"n\">appName<\/span><span class=\"p\">(<\/span><span class=\"s\">\"JSON SQL\"<\/span><span class=\"p\">)<\/span> \\\r\n    <span class=\"p\">.<\/span><span class=\"n\">getOrCreate<\/span><span class=\"p\">()<\/span>\r\n\r\n<span class=\"c1\"># DataFrame\u306e\u4f5c\u6210\r\n<\/span><span class=\"n\">df<\/span> <span class=\"o\">=<\/span> <span class=\"n\">spark<\/span><span class=\"p\">.<\/span><span class=\"n\">read<\/span><span class=\"p\">.<\/span><span class=\"n\">json<\/span><span class=\"p\">(<\/span><span class=\"s\">'sample.json'<\/span><span class=\"p\">)<\/span>\r\n\r\n<span class=\"c1\"># JSON\u3092\u8aad\u307f\u8fbc\u3093\u3060\u306e\u3067\u30b9\u30ad\u30fc\u30de\u3092\u78ba\u8a8d\r\n<\/span><span class=\"n\">df<\/span><span class=\"p\">.<\/span><span class=\"n\">printSchema<\/span><span class=\"p\">()<\/span>\r\n\r\n<span class=\"c1\"># SparkSession\u306b\u30c6\u30fc\u30d6\u30eb\u3068\u3057\u3066\u767b\u9332\r\n<\/span><span class=\"n\">df<\/span><span class=\"p\">.<\/span><span class=\"n\">registerTempTable<\/span><span class=\"p\">(<\/span><span class=\"s\">'people'<\/span><span class=\"p\">)<\/span>\r\n\r\n<span class=\"c1\"># Spark SQL\u306b\u3088\u308b\u691c\u7d22\r\n<\/span><span class=\"n\">selected<\/span> <span class=\"o\">=<\/span> <span class=\"n\">spark<\/span><span class=\"p\">.<\/span><span class=\"n\">sql<\/span><span class=\"p\">(<\/span><span class=\"s\">'SELECT * FROM people WHERE name==<\/span><span class=\"se\">\\\"<\/span><span class=\"s\">Alice<\/span><span class=\"se\">\\\"<\/span><span class=\"s\">'<\/span><span class=\"p\">)<\/span>\r\n<span class=\"n\">selected<\/span><span class=\"p\">.<\/span><span class=\"n\">show<\/span><span class=\"p\">()<\/span>\r\n<\/code><\/pre>\n<h2>\u8fde\u63a5\u5404\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5e93<\/p>\n<p>\u3055\u307e\u3056\u307e\u306a\u30c7\u30fc\u30bf\u30d9\u30fc\u30b9\u306b\u63a5\u7d9a\u3059\u308b\u5834\u5408\u3001connector\u3092config\u306b\u4e0e\u3048\u308b\u3053\u3068\u3067\u63a5\u7d9a\u3067\u304d\u307e\u3059\u3002\u305f\u3068\u3048\u3070\u3001\u3055\u307e\u3056\u307e\u306a\u30c7\u30fc\u30bf\u30d9\u30fc\u30b9\u306e\u63a5\u7d9a\u304c\u3001Spark Package\u3068\u3057\u3066\u63d0\u4f9b\u3055\u308c\u3066\u3044\u307e\u3059\u3002<\/p>\n<p>\u5173\u7cfb\u578b\u6570\u636e\u5e93\u7ba1\u7406\u7cfb\u7edf\uff08MySQL\uff09<\/p>\n<p>\u9996\u5148\uff0c\u6211\u4eec\u6765\u5c1d\u8bd5\u8fde\u63a5\u5230\u5173\u7cfb\u578b\u6570\u636e\u5e93\u7ba1\u7406\u7cfb\u7edf\uff08RDBMS\uff09MySQL\u3002\u60a8\u9700\u8981\u51c6\u5907MySQL\u4e13\u7528\u7684JDBC\u9a71\u52a8\u7a0b\u5e8f\u7684JAR\u6587\u4ef6\u3002<\/p>\n<p>\u4ee5\u4e0b\u304b\u3089\u3001JAR\u30d5\u30a1\u30a4\u30eb\u3092\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u3057\u3066\u3001\u9069\u5f53\u306a\u4f4d\u7f6e\u306bJAR\u30d5\u30a1\u30a4\u30eb\u3092\u8a2d\u7f6e\u3057\u307e\u3059\u3002<br \/>\nhttps:\/\/jar-download.com\/artifacts\/mysql\/mysql-connector-java<\/p>\n<p>\u9700\u8981\u4f7f\u7528SparkSession\u5e94\u7528\u6b64JAR\u6587\u4ef6\u3002<\/p>\n<pre class=\"post-pre\"><code><span class=\"kn\">from<\/span> <span class=\"nn\">pyspark.sql<\/span> <span class=\"kn\">import<\/span> <span class=\"n\">SparkSession<\/span>\r\n\r\n<span class=\"n\">spark<\/span> <span class=\"o\">=<\/span> <span class=\"n\">SparkSession<\/span><span class=\"p\">.<\/span><span class=\"n\">builder<\/span><span class=\"p\">.<\/span><span class=\"n\">config<\/span><span class=\"p\">(<\/span><span class=\"s\">\"spark.jars\"<\/span><span class=\"p\">,<\/span> <span class=\"s\">\"...jars<\/span><span class=\"se\">\\\\<\/span><span class=\"s\">mysql-connector-java-8.0.27.jar\"<\/span><span class=\"p\">)<\/span> \\\r\n<span class=\"p\">.<\/span><span class=\"n\">master<\/span><span class=\"p\">(<\/span><span class=\"s\">\"local\"<\/span><span class=\"p\">).<\/span><span class=\"n\">appName<\/span><span class=\"p\">(<\/span><span class=\"s\">\"PySpark_MySQL_test\"<\/span><span class=\"p\">).<\/span><span class=\"n\">getOrCreate<\/span><span class=\"p\">()<\/span>\r\n\r\n<span class=\"n\">df<\/span> <span class=\"o\">=<\/span> <span class=\"n\">spark<\/span><span class=\"p\">.<\/span><span class=\"n\">read<\/span><span class=\"p\">.<\/span><span class=\"nb\">format<\/span><span class=\"p\">(<\/span><span class=\"s\">\"jdbc\"<\/span><span class=\"p\">).<\/span><span class=\"n\">option<\/span><span class=\"p\">(<\/span><span class=\"s\">\"url\"<\/span><span class=\"p\">,<\/span> <span class=\"s\">\"jdbc:mysql:\/\/localhost:3306\/my_schema\"<\/span><span class=\"p\">)<\/span> \\\r\n<span class=\"p\">.<\/span><span class=\"n\">option<\/span><span class=\"p\">(<\/span><span class=\"s\">\"driver\"<\/span><span class=\"p\">,<\/span> <span class=\"s\">\"com.mysql.cj.jdbc.Driver\"<\/span><span class=\"p\">).<\/span><span class=\"n\">option<\/span><span class=\"p\">(<\/span><span class=\"s\">\"dbtable\"<\/span><span class=\"p\">,<\/span> <span class=\"s\">\"shema_name\"<\/span><span class=\"p\">)<\/span> \\\r\n<span class=\"p\">.<\/span><span class=\"n\">option<\/span><span class=\"p\">(<\/span><span class=\"s\">\"user\"<\/span><span class=\"p\">,<\/span> <span class=\"s\">\"xxxx\"<\/span><span class=\"p\">).<\/span><span class=\"n\">option<\/span><span class=\"p\">(<\/span><span class=\"s\">\"password\"<\/span><span class=\"p\">,<\/span> <span class=\"s\">\"xxxx\"<\/span><span class=\"p\">).<\/span><span class=\"n\">load<\/span><span class=\"p\">()<\/span>\r\n\r\n<span class=\"n\">df<\/span><span class=\"p\">.<\/span><span class=\"n\">show<\/span><span class=\"p\">()<\/span>\r\n<\/code><\/pre>\n<p>MongoDB(\u975e\u5173\u7cfb\u578b\u6570\u636e\u5e93)<\/p>\n<p>\u540c\u6837, \u60a8\u4e5f\u53ef\u4ee5\u901a\u8fc7NoSQL\u6570\u636e\u5e93MongoDB\u8fdb\u884c\u8bbf\u95ee\u3002MongoDB\u9a71\u52a8\u7a0b\u5e8f\u7531Spark Package\u63d0\u4f9b, \u60a8\u53ef\u4ee5\u6309\u7167\u4ee5\u4e0b\u65b9\u5f0f\u8fdb\u884c\u8bbf\u95ee\u3002<\/p>\n<p>MongoDB\u53ef\u4ee5\u901a\u8fc7\u4ee5\u4e0b\u65b9\u5f0f\u521b\u5efaSparkSession\u3002<\/p>\n<pre class=\"post-pre\"><code><span class=\"kn\">from<\/span> <span class=\"nn\">pyspark.sql<\/span> <span class=\"kn\">import<\/span> <span class=\"n\">SparkSession<\/span>\r\n\r\n<span class=\"n\">spark<\/span> <span class=\"o\">=<\/span> <span class=\"n\">SparkSession<\/span><span class=\"p\">.<\/span><span class=\"n\">builder<\/span><span class=\"p\">.<\/span><span class=\"n\">appName<\/span><span class=\"p\">(<\/span><span class=\"s\">\"myApp\"<\/span><span class=\"p\">)<\/span> \\\r\n<span class=\"p\">.<\/span><span class=\"n\">config<\/span><span class=\"p\">(<\/span><span class=\"s\">\"spark.mongodb.input.uri\"<\/span><span class=\"p\">,<\/span> <span class=\"s\">\"mongodb:\/\/localhost:27017\/db_name.collection_name\"<\/span><span class=\"p\">)<\/span> \\\r\n<span class=\"p\">.<\/span><span class=\"n\">config<\/span><span class=\"p\">(<\/span><span class=\"s\">\"spark.mongodb.output.uri\"<\/span><span class=\"p\">,<\/span> <span class=\"s\">\"mongodb:\/\/localhost:27017\/db_name.collection_name\"<\/span><span class=\"p\">)<\/span> \\\r\n<span class=\"p\">.<\/span><span class=\"n\">config<\/span><span class=\"p\">(<\/span><span class=\"s\">'spark.jars.packages'<\/span><span class=\"p\">,<\/span> <span class=\"s\">'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1'<\/span><span class=\"p\">)<\/span> \\\r\n<span class=\"p\">.<\/span><span class=\"n\">getOrCreate<\/span><span class=\"p\">()<\/span>\r\n\r\n<span class=\"n\">df<\/span> <span class=\"o\">=<\/span> <span class=\"n\">spark<\/span><span class=\"p\">.<\/span><span class=\"n\">read<\/span><span class=\"p\">.<\/span><span class=\"nb\">format<\/span><span class=\"p\">(<\/span><span class=\"s\">\"com.mongodb.spark.sql.DefaultSource\"<\/span><span class=\"p\">).<\/span><span class=\"n\">load<\/span><span class=\"p\">()<\/span>\r\n<span class=\"n\">df<\/span><span class=\"p\">.<\/span><span class=\"n\">show<\/span><span class=\"p\">()<\/span>\r\n<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\uff082022.1.24\u4fee\u6b63\uff09 \u4e3a\u4e86\u4f7fSpark\u5728Windows\u4e0a\u8fd0\u884c\uff0c\u5bf9winutils\u8fdb\u884c\u4e86\u4fee\u6b63\u3002 \u4e3a\u4e86\u5728Wi [&hellip;]<\/p>\n","protected":false},"author":5,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[227],"class_list":["post-29957","post","type-post","status-publish","format-standard","hentry","category-uncategorized","tag-227"],"yoast_head":"<!-- This site is optimized with the Yoast SEO Premium plugin v21.5 (Yoast SEO v21.5) - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248] - Blog - Silicon Cloud<\/title>\n<meta name=\"description\" content=\"\u5173\u4e8e2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248]\u7684\u6280\u672f\u6587\u7ae0\" \/>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.silicloud.com\/zh\/blog\/2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728windows\u73af\u5883\u4e0b\u4f7f\u7528pyspark\u4fee\u6b63\u7248\u3002\/\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248]\" \/>\n<meta property=\"og:description\" content=\"\u5173\u4e8e2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248]\u7684\u6280\u672f\u6587\u7ae0\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.silicloud.com\/zh\/blog\/2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728windows\u73af\u5883\u4e0b\u4f7f\u7528pyspark\u4fee\u6b63\u7248\u3002\/\" \/>\n<meta property=\"og:site_name\" content=\"Blog - Silicon Cloud\" \/>\n<meta property=\"article:published_time\" content=\"2024-03-08T22:54:22+00:00\" \/>\n<meta property=\"article:modified_time\" content=\"2025-08-12T08:46:47+00:00\" \/>\n<meta name=\"author\" content=\"\u6e05, \u5b87\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"\u6e05, \u5b87\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"1 \u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/\",\"url\":\"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/\",\"name\":\"2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248] - Blog - Silicon Cloud\",\"isPartOf\":{\"@id\":\"https:\/\/www.silicloud.com\/zh\/blog\/#website\"},\"datePublished\":\"2024-03-08T22:54:22+00:00\",\"dateModified\":\"2025-08-12T08:46:47+00:00\",\"author\":{\"@id\":\"https:\/\/www.silicloud.com\/zh\/blog\/#\/schema\/person\/1a6ecd3d914d22a5ac32791ffc1fbd8e\"},\"description\":\"\u5173\u4e8e2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248]\u7684\u6280\u672f\u6587\u7ae0\",\"breadcrumb\":{\"@id\":\"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/www.silicloud.com\/zh\/blog\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248]\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.silicloud.com\/zh\/blog\/#website\",\"url\":\"https:\/\/www.silicloud.com\/zh\/blog\/\",\"name\":\"Blog - Silicon Cloud\",\"description\":\"\",\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.silicloud.com\/zh\/blog\/#\/schema\/person\/1a6ecd3d914d22a5ac32791ffc1fbd8e\",\"name\":\"\u6e05, \u5b87\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.silicloud.com\/zh\/blog\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/secure.gravatar.com\/avatar\/4b2016c18459a605fc469c7566608f5686491baa112d0871ee613f61b7210565?s=96&d=mm&r=g\",\"contentUrl\":\"https:\/\/secure.gravatar.com\/avatar\/4b2016c18459a605fc469c7566608f5686491baa112d0871ee613f61b7210565?s=96&d=mm&r=g\",\"caption\":\"\u6e05, \u5b87\"},\"url\":\"https:\/\/www.silicloud.com\/zh\/blog\/author\/qingyu\/\"},{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/#local-main-organization-logo\",\"url\":\"\",\"contentUrl\":\"\",\"caption\":\"Blog - Silicon Cloud\"}]}<\/script>\n<!-- \/ Yoast SEO Premium plugin. -->","yoast_head_json":{"title":"2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248] - Blog - Silicon Cloud","description":"\u5173\u4e8e2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248]\u7684\u6280\u672f\u6587\u7ae0","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.silicloud.com\/zh\/blog\/2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728windows\u73af\u5883\u4e0b\u4f7f\u7528pyspark\u4fee\u6b63\u7248\u3002\/","og_locale":"zh_CN","og_type":"article","og_title":"2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248]","og_description":"\u5173\u4e8e2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248]\u7684\u6280\u672f\u6587\u7ae0","og_url":"https:\/\/www.silicloud.com\/zh\/blog\/2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728windows\u73af\u5883\u4e0b\u4f7f\u7528pyspark\u4fee\u6b63\u7248\u3002\/","og_site_name":"Blog - Silicon Cloud","article_published_time":"2024-03-08T22:54:22+00:00","article_modified_time":"2025-08-12T08:46:47+00:00","author":"\u6e05, \u5b87","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"\u6e05, \u5b87","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"1 \u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/","url":"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/","name":"2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248] - Blog - Silicon Cloud","isPartOf":{"@id":"https:\/\/www.silicloud.com\/zh\/blog\/#website"},"datePublished":"2024-03-08T22:54:22+00:00","dateModified":"2025-08-12T08:46:47+00:00","author":{"@id":"https:\/\/www.silicloud.com\/zh\/blog\/#\/schema\/person\/1a6ecd3d914d22a5ac32791ffc1fbd8e"},"description":"\u5173\u4e8e2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248]\u7684\u6280\u672f\u6587\u7ae0","breadcrumb":{"@id":"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/www.silicloud.com\/zh\/blog\/"},{"@type":"ListItem","position":2,"name":"2022\u5e741\u670824\u65e5\uff0c\u8ba9\u6211\u4eec\u5728Windows\u73af\u5883\u4e0b\u4f7f\u7528PySpark[\u4fee\u6b63\u7248]"}]},{"@type":"WebSite","@id":"https:\/\/www.silicloud.com\/zh\/blog\/#website","url":"https:\/\/www.silicloud.com\/zh\/blog\/","name":"Blog - Silicon Cloud","description":"","inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/www.silicloud.com\/zh\/blog\/#\/schema\/person\/1a6ecd3d914d22a5ac32791ffc1fbd8e","name":"\u6e05, \u5b87","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.silicloud.com\/zh\/blog\/#\/schema\/person\/image\/","url":"https:\/\/secure.gravatar.com\/avatar\/4b2016c18459a605fc469c7566608f5686491baa112d0871ee613f61b7210565?s=96&d=mm&r=g","contentUrl":"https:\/\/secure.gravatar.com\/avatar\/4b2016c18459a605fc469c7566608f5686491baa112d0871ee613f61b7210565?s=96&d=mm&r=g","caption":"\u6e05, \u5b87"},"url":"https:\/\/www.silicloud.com\/zh\/blog\/author\/qingyu\/"},{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.silicloud.com\/zh\/blog\/2022%e5%b9%b41%e6%9c%8824%e6%97%a5%ef%bc%8c%e8%ae%a9%e6%88%91%e4%bb%ac%e5%9c%a8windows%e7%8e%af%e5%a2%83%e4%b8%8b%e4%bd%bf%e7%94%a8pyspark%e4%bf%ae%e6%ad%a3%e7%89%88%e3%80%82\/#local-main-organization-logo","url":"","contentUrl":"","caption":"Blog - Silicon Cloud"}]}},"_links":{"self":[{"href":"https:\/\/www.silicloud.com\/zh\/blog\/wp-json\/wp\/v2\/posts\/29957","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.silicloud.com\/zh\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.silicloud.com\/zh\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.silicloud.com\/zh\/blog\/wp-json\/wp\/v2\/users\/5"}],"replies":[{"embeddable":true,"href":"https:\/\/www.silicloud.com\/zh\/blog\/wp-json\/wp\/v2\/comments?post=29957"}],"version-history":[{"count":2,"href":"https:\/\/www.silicloud.com\/zh\/blog\/wp-json\/wp\/v2\/posts\/29957\/revisions"}],"predecessor-version":[{"id":111620,"href":"https:\/\/www.silicloud.com\/zh\/blog\/wp-json\/wp\/v2\/posts\/29957\/revisions\/111620"}],"wp:attachment":[{"href":"https:\/\/www.silicloud.com\/zh\/blog\/wp-json\/wp\/v2\/media?parent=29957"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.silicloud.com\/zh\/blog\/wp-json\/wp\/v2\/categories?post=29957"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.silicloud.com\/zh\/blog\/wp-json\/wp\/v2\/tags?post=29957"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}